chore: update something
Browse files
- docsifer/service.py +13 -13
- poetry.lock +0 -0
- requirements.txt +1 -1
docsifer/service.py
CHANGED
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
| 2 |
|
| 3 |
import asyncio
|
| 4 |
import logging
|
|
|
|
| 5 |
# import tempfile
|
| 6 |
|
| 7 |
import requests.cookies
|
|
@@ -149,7 +150,7 @@ class DocsiferService:
|
|
| 149 |
new_filename = f"{src.stem}{guessed_ext}"
|
| 150 |
tmp_path = src.parent / new_filename
|
| 151 |
tmp_path.write_bytes(src.read_bytes())
|
| 152 |
-
|
| 153 |
|
| 154 |
logger.info(
|
| 155 |
"Using temp file: %s, MIME type: %s, Guessed ext: %s, Existing: %s",
|
|
@@ -160,8 +161,8 @@ class DocsiferService:
|
|
| 160 |
)
|
| 161 |
|
| 162 |
# Perform HTML cleanup if requested.
|
| 163 |
-
|
| 164 |
-
|
| 165 |
|
| 166 |
filename = new_filename
|
| 167 |
source = tmp_path
|
|
@@ -173,23 +174,22 @@ class DocsiferService:
|
|
| 173 |
md_converter = self._basic_markitdown
|
| 174 |
|
| 175 |
# Load cookies if provided in the HTTP config.
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
|
| 184 |
try:
|
| 185 |
result_obj = md_converter.convert(source)
|
| 186 |
-
print("result_obj:\n", result_obj)
|
| 187 |
except Exception as e:
|
| 188 |
logger.error("MarkItDown conversion failed: %s", e)
|
| 189 |
raise RuntimeError(f"Conversion failed for '{source}': {e}")
|
| 190 |
|
| 191 |
-
|
| 192 |
-
|
| 193 |
|
| 194 |
# Count tokens in the resulting markdown text.
|
| 195 |
token_count = self._count_tokens(result_obj.text_content)
|
|
|
|
| 2 |
|
| 3 |
import asyncio
|
| 4 |
import logging
|
| 5 |
+
|
| 6 |
# import tempfile
|
| 7 |
|
| 8 |
import requests.cookies
|
|
|
|
| 150 |
new_filename = f"{src.stem}{guessed_ext}"
|
| 151 |
tmp_path = src.parent / new_filename
|
| 152 |
tmp_path.write_bytes(src.read_bytes())
|
| 153 |
+
src.unlink()
|
| 154 |
|
| 155 |
logger.info(
|
| 156 |
"Using temp file: %s, MIME type: %s, Guessed ext: %s, Existing: %s",
|
|
|
|
| 161 |
)
|
| 162 |
|
| 163 |
# Perform HTML cleanup if requested.
|
| 164 |
+
if cleanup and guessed_ext.lower() in (".html", ".htm"):
|
| 165 |
+
self._maybe_cleanup_html(tmp_path)
|
| 166 |
|
| 167 |
filename = new_filename
|
| 168 |
source = tmp_path
|
|
|
|
| 174 |
md_converter = self._basic_markitdown
|
| 175 |
|
| 176 |
# Load cookies if provided in the HTTP config.
|
| 177 |
+
if http_config:
|
| 178 |
+
if "cookies" in http_config:
|
| 179 |
+
requests.cookies.cookiejar_from_dict(
|
| 180 |
+
http_config["cookies"],
|
| 181 |
+
requests.cookies.RequestsCookieJar,
|
| 182 |
+
overwrite=True,
|
| 183 |
+
)
|
| 184 |
|
| 185 |
try:
|
| 186 |
result_obj = md_converter.convert(source)
|
|
|
|
| 187 |
except Exception as e:
|
| 188 |
logger.error("MarkItDown conversion failed: %s", e)
|
| 189 |
raise RuntimeError(f"Conversion failed for '{source}': {e}")
|
| 190 |
|
| 191 |
+
if isinstance(source, Path) and source.exists():
|
| 192 |
+
source.unlink()
|
| 193 |
|
| 194 |
# Count tokens in the resulting markdown text.
|
| 195 |
token_count = self._count_tokens(result_obj.text_content)
|
poetry.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
CHANGED
|
@@ -5,7 +5,7 @@ requests
|
|
| 5 |
pydantic
|
| 6 |
cachetools
|
| 7 |
scuid
|
| 8 |
-
markitdown
|
| 9 |
upstash_redis==1.2.0
|
| 10 |
openai==1.59.7
|
| 11 |
pyquery==2.0.1
|
|
|
|
| 5 |
pydantic
|
| 6 |
cachetools
|
| 7 |
scuid
|
| 8 |
+
markitdown @ git+https://github.com/lh0x00/markitdown@c5e3ab4
|
| 9 |
upstash_redis==1.2.0
|
| 10 |
openai==1.59.7
|
| 11 |
pyquery==2.0.1
|