Spaces:

lamhieu
/

docsifer

Running

App Files Community

lamhieu committed on Feb 28

Commit

86b0060

1 Parent(s): 25ddcaa

chore: update something

Browse files

Files changed (3) hide show

docsifer/service.py +13 -13
poetry.lock +0 -0
requirements.txt +1 -1

docsifer/service.py CHANGED Viewed

@@ -2,6 +2,7 @@ from __future__ import annotations
 import asyncio
 import logging
 # import tempfile
 import requests.cookies
@@ -149,7 +150,7 @@ class DocsiferService:
                 new_filename = f"{src.stem}{guessed_ext}"
             tmp_path = src.parent / new_filename
             tmp_path.write_bytes(src.read_bytes())
-            # src.unlink()
             logger.info(
                 "Using temp file: %s, MIME type: %s, Guessed ext: %s, Existing: %s",
@@ -160,8 +161,8 @@ class DocsiferService:
             )
             # Perform HTML cleanup if requested.
-            # if cleanup and guessed_ext.lower() in (".html", ".htm"):
-            #     self._maybe_cleanup_html(tmp_path)
             filename = new_filename
             source = tmp_path
@@ -173,23 +174,22 @@ class DocsiferService:
             md_converter = self._basic_markitdown
         # Load cookies if provided in the HTTP config.
-        # if http_config:
-        #     if "cookies" in http_config:
-        #         requests.cookies.cookiejar_from_dict(
-        #             http_config["cookies"],
-        #             requests.cookies.RequestsCookieJar,
-        #             overwrite=True,
-        #         )
         try:
             result_obj = md_converter.convert(source)
-            print("result_obj:\n", result_obj)
         except Exception as e:
             logger.error("MarkItDown conversion failed: %s", e)
             raise RuntimeError(f"Conversion failed for '{source}': {e}")
-        # if isinstance(source, Path) and source.exists():
-        #     source.unlink()
         # Count tokens in the resulting markdown text.
         token_count = self._count_tokens(result_obj.text_content)

 import asyncio
 import logging
 # import tempfile
 import requests.cookies
                 new_filename = f"{src.stem}{guessed_ext}"
             tmp_path = src.parent / new_filename
             tmp_path.write_bytes(src.read_bytes())
+            src.unlink()
             logger.info(
                 "Using temp file: %s, MIME type: %s, Guessed ext: %s, Existing: %s",
             )
             # Perform HTML cleanup if requested.
+            if cleanup and guessed_ext.lower() in (".html", ".htm"):
+                self._maybe_cleanup_html(tmp_path)
             filename = new_filename
             source = tmp_path
             md_converter = self._basic_markitdown
         # Load cookies if provided in the HTTP config.
+        if http_config:
+            if "cookies" in http_config:
+                requests.cookies.cookiejar_from_dict(
+                    http_config["cookies"],
+                    requests.cookies.RequestsCookieJar,
+                    overwrite=True,
+                )
         try:
             result_obj = md_converter.convert(source)
         except Exception as e:
             logger.error("MarkItDown conversion failed: %s", e)
             raise RuntimeError(f"Conversion failed for '{source}': {e}")
+        if isinstance(source, Path) and source.exists():
+            source.unlink()
         # Count tokens in the resulting markdown text.
         token_count = self._count_tokens(result_obj.text_content)

poetry.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt CHANGED Viewed

@@ -5,7 +5,7 @@ requests
 pydantic
 cachetools
 scuid
-markitdown==0.0.1a3
 upstash_redis==1.2.0
 openai==1.59.7
 pyquery==2.0.1

 pydantic
 cachetools
 scuid
+markitdown @ git+https://github.com/lh0x00/markitdown@c5e3ab4
 upstash_redis==1.2.0
 openai==1.59.7
 pyquery==2.0.1