feat: support convert from url and more
Browse files
- docsifer/__init__.py +135 -53
- docsifer/router.py +42 -15
- requirements.txt +1 -0
docsifer/__init__.py
CHANGED
|
@@ -17,8 +17,12 @@ from pathlib import Path
|
|
| 17 |
from scuid import scuid
|
| 18 |
|
| 19 |
|
| 20 |
-
# Filter out /v1 requests from the access log
|
| 21 |
class LogFilter(logging.Filter):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
def filter(self, record):
|
| 23 |
# Only keep log records that contain "/v1" in the request path
|
| 24 |
if record.args and len(record.args) >= 3:
|
|
@@ -30,7 +34,6 @@ class LogFilter(logging.Filter):
|
|
| 30 |
logger = logging.getLogger("uvicorn.access")
|
| 31 |
logger.addFilter(LogFilter())
|
| 32 |
|
| 33 |
-
# Application metadata
|
| 34 |
__version__ = "1.0.0"
|
| 35 |
__author__ = "lamhieu"
|
| 36 |
__description__ = "Docsifer: Efficient Data Conversion to Markdown."
|
|
@@ -46,11 +49,10 @@ __metadata__ = {
|
|
| 46 |
"spaces": "https://huggingface.co/spaces/lh0x00/docsifer",
|
| 47 |
}
|
| 48 |
|
| 49 |
-
#
|
| 50 |
DOCSIFER_API_URL = "http://localhost:7860/v1/convert"
|
| 51 |
DOCSIFER_STATS_URL = "http://localhost:7860/v1/stats"
|
| 52 |
|
| 53 |
-
# Markdown description for the main interface
|
| 54 |
APP_DESCRIPTION = f"""
|
| 55 |
# 📝 **Docsifer: Convert Your Documents to Markdown**
|
| 56 |
|
|
@@ -60,7 +62,7 @@ Welcome to **Docsifer**, a specialized service that converts your files—like P
|
|
| 60 |
|
| 61 |
- **Open Source**: The entire Docsifer codebase is publicly available for review and contribution.
|
| 62 |
- **Efficient & Flexible**: Supports multiple file formats, ensuring quick and accurate Markdown conversion.
|
| 63 |
-
- **Privacy-Focused**: We never store user data; all processing is
|
| 64 |
- **Production-Ready**: Easy Docker deployment, interactive Gradio playground, and comprehensive REST API documentation.
|
| 65 |
- **Community & Collaboration**: Contribute on [GitHub]({__metadata__["github"]}) or try it out on [Hugging Face Spaces]({__metadata__["spaces"]}).
|
| 66 |
|
|
@@ -68,7 +70,6 @@ Welcome to **Docsifer**, a specialized service that converts your files—like P
|
|
| 68 |
- [Documentation]({__metadata__["docs"]}) | [GitHub]({__metadata__["github"]}) | [Live Demo]({__metadata__["spaces"]})
|
| 69 |
"""
|
| 70 |
|
| 71 |
-
# Initialize FastAPI application
|
| 72 |
app = FastAPI(
|
| 73 |
title="Docsifer Service API",
|
| 74 |
description=__description__,
|
|
@@ -77,7 +78,7 @@ app = FastAPI(
|
|
| 77 |
redoc_url="/redoc",
|
| 78 |
)
|
| 79 |
|
| 80 |
-
# Configure CORS
|
| 81 |
app.add_middleware(
|
| 82 |
CORSMiddleware,
|
| 83 |
allow_origins=["*"], # Adjust if needed for specific domains
|
|
@@ -86,34 +87,40 @@ app.add_middleware(
|
|
| 86 |
allow_headers=["*"],
|
| 87 |
)
|
| 88 |
|
| 89 |
-
# Import and include your existing router (
|
| 90 |
from .router import router
|
| 91 |
|
| 92 |
app.include_router(router, prefix="/v1")
|
| 93 |
|
| 94 |
|
| 95 |
def call_convert_api(
|
| 96 |
-
file_obj: bytes,
|
| 97 |
-
filename: str,
|
|
|
|
| 98 |
cleanup: bool = True,
|
| 99 |
openai_base_url: Optional[str] = None,
|
| 100 |
openai_api_key: Optional[str] = None,
|
| 101 |
openai_model: Optional[str] = None,
|
| 102 |
) -> Tuple[str, str]:
|
| 103 |
"""
|
| 104 |
-
|
| 105 |
-
If there's an error, the first return value is an error message (str),
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
"""
|
| 113 |
-
|
| 114 |
-
if file_obj is None:
|
| 115 |
-
return ("❌ No file was uploaded.", "")
|
| 116 |
-
|
| 117 |
# Build the "openai" object
|
| 118 |
openai_dict = {}
|
| 119 |
if openai_api_key and openai_api_key.strip():
|
|
@@ -127,17 +134,27 @@ def call_convert_api(
|
|
| 127 |
settings_dict = {"cleanup": cleanup}
|
| 128 |
|
| 129 |
data = {
|
| 130 |
-
#
|
| 131 |
"openai": json.dumps(openai_dict),
|
| 132 |
"settings": json.dumps(settings_dict),
|
| 133 |
}
|
| 134 |
|
|
|
|
| 135 |
if len(openai_dict) <= 3:
|
| 136 |
data.pop("openai")
|
| 137 |
|
| 138 |
-
#
|
| 139 |
-
files = {
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
try:
|
| 142 |
response = requests.post(DOCSIFER_API_URL, files=files, data=data, timeout=30)
|
| 143 |
except requests.exceptions.RequestException as e:
|
|
@@ -146,14 +163,15 @@ def call_convert_api(
|
|
| 146 |
if response.status_code != 200:
|
| 147 |
return (f"❌ API Error {response.status_code}: {response.text}", "")
|
| 148 |
|
|
|
|
| 149 |
try:
|
| 150 |
converted = response.json()
|
| 151 |
-
#
|
| 152 |
markdown_content = converted["markdown"]
|
| 153 |
except Exception as e:
|
| 154 |
return (f"❌ Error parsing JSON: {str(e)}", "")
|
| 155 |
|
| 156 |
-
# Write the returned Markdown to a
|
| 157 |
with tempfile.NamedTemporaryFile(
|
| 158 |
mode="w+", suffix=".md", dir="/tmp", delete=False
|
| 159 |
) as tmp_file:
|
|
@@ -165,8 +183,17 @@ def call_convert_api(
|
|
| 165 |
|
| 166 |
def call_stats_api_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
|
| 167 |
"""
|
| 168 |
-
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
"""
|
| 171 |
try:
|
| 172 |
response = requests.get(DOCSIFER_STATS_URL, timeout=10)
|
|
@@ -186,8 +213,10 @@ def call_stats_api_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
|
|
| 186 |
tokens_data = data.get("tokens", {})
|
| 187 |
|
| 188 |
def build_stats_df(bucket: dict) -> pd.DataFrame:
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
| 191 |
all_models = set()
|
| 192 |
for period_key in ["total", "daily", "weekly", "monthly", "yearly"]:
|
| 193 |
period_dict = bucket.get(period_key, {})
|
|
@@ -219,21 +248,31 @@ def call_stats_api_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
|
|
| 219 |
|
| 220 |
def create_main_interface():
|
| 221 |
"""
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
"""
|
| 227 |
with gr.Blocks(title="Docsifer: Convert to Markdown", theme="default") as demo:
|
| 228 |
gr.Markdown(APP_DESCRIPTION)
|
| 229 |
|
| 230 |
with gr.Tab("Conversion Playground"):
|
| 231 |
-
gr.Markdown("### Convert your files to Markdown with Docsifer.")
|
| 232 |
|
| 233 |
with gr.Row():
|
|
|
|
| 234 |
with gr.Column():
|
| 235 |
file_input = gr.File(
|
| 236 |
-
label="Upload File",
|
| 237 |
file_types=[
|
| 238 |
".pdf",
|
| 239 |
".docx",
|
|
@@ -251,6 +290,11 @@ def create_main_interface():
|
|
| 251 |
type="binary",
|
| 252 |
)
|
| 253 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
with gr.Accordion("OpenAI Configuration (Optional)", open=False):
|
| 255 |
gr.Markdown(
|
| 256 |
"Provide these if you'd like **LLM-assisted** extraction. "
|
|
@@ -275,7 +319,8 @@ def create_main_interface():
|
|
| 275 |
|
| 276 |
with gr.Accordion("Conversion Settings", open=True):
|
| 277 |
gr.Markdown(
|
| 278 |
-
"Enable to remove <style> tags or hidden elements
|
|
|
|
| 279 |
)
|
| 280 |
cleanup_toggle = gr.Checkbox(
|
| 281 |
label="Enable Cleanup",
|
|
@@ -284,13 +329,12 @@ def create_main_interface():
|
|
| 284 |
|
| 285 |
convert_btn = gr.Button("Convert")
|
| 286 |
|
|
|
|
| 287 |
with gr.Column():
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
)
|
| 293 |
-
# Set visible=True so the user always sees a small download button
|
| 294 |
download_file = gr.File(
|
| 295 |
label="Download",
|
| 296 |
interactive=False,
|
|
@@ -309,32 +353,64 @@ def create_main_interface():
|
|
| 309 |
-F "openai={\\"api_key\\":\\"sk-xxxxx\\",\\"model\\":\\"gpt-4o-mini\\",\\"base_url\\":\\"https://api.openai.com/v1\\"}" \\
|
| 310 |
-F "settings={\\"cleanup\\":true}"
|
| 311 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
"""
|
| 313 |
)
|
| 314 |
|
| 315 |
-
|
|
|
|
| 316 |
"""
|
| 317 |
-
|
| 318 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 319 |
"""
|
| 320 |
-
|
| 321 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
|
| 323 |
-
|
| 324 |
markdown, temp_md_path = call_convert_api(
|
| 325 |
file_obj=file_bytes,
|
| 326 |
filename=unique_name,
|
|
|
|
| 327 |
openai_base_url=base_url,
|
| 328 |
openai_api_key=api_key,
|
| 329 |
openai_model=model_id,
|
| 330 |
cleanup=cleanup,
|
| 331 |
)
|
|
|
|
| 332 |
return markdown, temp_md_path
|
| 333 |
|
|
|
|
| 334 |
convert_btn.click(
|
| 335 |
fn=on_convert,
|
| 336 |
inputs=[
|
| 337 |
file_input,
|
|
|
|
| 338 |
openai_base_url,
|
| 339 |
openai_api_key,
|
| 340 |
openai_model,
|
|
@@ -348,6 +424,7 @@ def create_main_interface():
|
|
| 348 |
"View Docsifer usage statistics (access count, token usage, etc.)"
|
| 349 |
)
|
| 350 |
stats_btn = gr.Button("Get Stats")
|
|
|
|
| 351 |
access_df = gr.DataFrame(
|
| 352 |
label="Access Stats",
|
| 353 |
headers=["Model", "Total", "Daily", "Weekly", "Monthly", "Yearly"],
|
|
@@ -359,6 +436,7 @@ def create_main_interface():
|
|
| 359 |
interactive=False,
|
| 360 |
)
|
| 361 |
|
|
|
|
| 362 |
stats_btn.click(
|
| 363 |
fn=call_stats_api_df,
|
| 364 |
inputs=[],
|
|
@@ -368,17 +446,21 @@ def create_main_interface():
|
|
| 368 |
return demo
|
| 369 |
|
| 370 |
|
| 371 |
-
# Build our Gradio interface and mount it at the root path
|
| 372 |
main_interface = create_main_interface()
|
| 373 |
mount_gradio_app(app, main_interface, path="/")
|
| 374 |
|
| 375 |
|
| 376 |
-
# Startup / Shutdown events
|
| 377 |
@app.on_event("startup")
|
| 378 |
async def startup_event():
|
|
|
|
|
|
|
|
|
|
| 379 |
logger.info("Docsifer Service is starting up...")
|
| 380 |
|
| 381 |
|
| 382 |
@app.on_event("shutdown")
|
| 383 |
async def shutdown_event():
|
|
|
|
|
|
|
|
|
|
| 384 |
logger.info("Docsifer Service is shutting down.")
|
|
|
|
| 17 |
from scuid import scuid
|
| 18 |
|
| 19 |
|
|
|
|
| 20 |
class LogFilter(logging.Filter):
|
| 21 |
+
"""
|
| 22 |
+
A custom logging filter that only keeps log records containing '/v1'
|
| 23 |
+
in the request path. This helps to filter out other logs and reduce noise.
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
def filter(self, record):
|
| 27 |
# Only keep log records that contain "/v1" in the request path
|
| 28 |
if record.args and len(record.args) >= 3:
|
|
|
|
| 34 |
logger = logging.getLogger("uvicorn.access")
|
| 35 |
logger.addFilter(LogFilter())
|
| 36 |
|
|
|
|
| 37 |
__version__ = "1.0.0"
|
| 38 |
__author__ = "lamhieu"
|
| 39 |
__description__ = "Docsifer: Efficient Data Conversion to Markdown."
|
|
|
|
| 49 |
"spaces": "https://huggingface.co/spaces/lh0x00/docsifer",
|
| 50 |
}
|
| 51 |
|
| 52 |
+
# Docsifer API Endpoints (can be replaced with your live URLs if desired)
|
| 53 |
DOCSIFER_API_URL = "http://localhost:7860/v1/convert"
|
| 54 |
DOCSIFER_STATS_URL = "http://localhost:7860/v1/stats"
|
| 55 |
|
|
|
|
| 56 |
APP_DESCRIPTION = f"""
|
| 57 |
# 📝 **Docsifer: Convert Your Documents to Markdown**
|
| 58 |
|
|
|
|
| 62 |
|
| 63 |
- **Open Source**: The entire Docsifer codebase is publicly available for review and contribution.
|
| 64 |
- **Efficient & Flexible**: Supports multiple file formats, ensuring quick and accurate Markdown conversion.
|
| 65 |
+
- **Privacy-Focused**: We never store user data; all processing is temporary. We only collect minimal anonymous usage statistics to count the number of calls and the number of tokens, nothing else.
|
| 66 |
- **Production-Ready**: Easy Docker deployment, interactive Gradio playground, and comprehensive REST API documentation.
|
| 67 |
- **Community & Collaboration**: Contribute on [GitHub]({__metadata__["github"]}) or try it out on [Hugging Face Spaces]({__metadata__["spaces"]}).
|
| 68 |
|
|
|
|
| 70 |
- [Documentation]({__metadata__["docs"]}) | [GitHub]({__metadata__["github"]}) | [Live Demo]({__metadata__["spaces"]})
|
| 71 |
"""
|
| 72 |
|
|
|
|
| 73 |
app = FastAPI(
|
| 74 |
title="Docsifer Service API",
|
| 75 |
description=__description__,
|
|
|
|
| 78 |
redoc_url="/redoc",
|
| 79 |
)
|
| 80 |
|
| 81 |
+
# Configure CORS (Cross-Origin Resource Sharing)
|
| 82 |
app.add_middleware(
|
| 83 |
CORSMiddleware,
|
| 84 |
allow_origins=["*"], # Adjust if needed for specific domains
|
|
|
|
| 87 |
allow_headers=["*"],
|
| 88 |
)
|
| 89 |
|
| 90 |
+
# Import and include your existing router (with /v1 endpoints)
|
| 91 |
from .router import router
|
| 92 |
|
| 93 |
app.include_router(router, prefix="/v1")
|
| 94 |
|
| 95 |
|
| 96 |
def call_convert_api(
|
| 97 |
+
file_obj: Optional[bytes],
|
| 98 |
+
filename: str = "",
|
| 99 |
+
url: Optional[str] = None,
|
| 100 |
cleanup: bool = True,
|
| 101 |
openai_base_url: Optional[str] = None,
|
| 102 |
openai_api_key: Optional[str] = None,
|
| 103 |
openai_model: Optional[str] = None,
|
| 104 |
) -> Tuple[str, str]:
|
| 105 |
"""
|
| 106 |
+
Call the /v1/convert endpoint, returning (markdown_content, md_file_path).
|
| 107 |
+
- If there's an error, the first return value is an error message (str),
|
| 108 |
+
the second is an empty string.
|
| 109 |
+
|
| 110 |
+
Args:
|
| 111 |
+
file_obj (Optional[bytes]): The raw file bytes to be sent. If None, 'url' is used.
|
| 112 |
+
filename (str): Name of the file (will be posted to the endpoint).
|
| 113 |
+
url (str, optional): URL to be converted (used only if file_obj is None).
|
| 114 |
+
cleanup (bool): Whether to enable cleanup mode for HTML files.
|
| 115 |
+
openai_base_url (str, optional): Base URL for OpenAI or compatible LLM.
|
| 116 |
+
openai_api_key (str, optional): API key for the LLM.
|
| 117 |
+
openai_model (str, optional): Model name to use for LLM-based extraction.
|
| 118 |
+
|
| 119 |
+
Returns:
|
| 120 |
+
(str, str):
|
| 121 |
+
- markdown_content (str): The conversion result in Markdown form or an error message.
|
| 122 |
+
- tmp_md_path (str): The path to the temporary .md file for download.
|
| 123 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
# Build the "openai" object
|
| 125 |
openai_dict = {}
|
| 126 |
if openai_api_key and openai_api_key.strip():
|
|
|
|
| 134 |
settings_dict = {"cleanup": cleanup}
|
| 135 |
|
| 136 |
data = {
|
| 137 |
+
# Must match the `Form(...)` fields named "openai" and "settings"
|
| 138 |
"openai": json.dumps(openai_dict),
|
| 139 |
"settings": json.dumps(settings_dict),
|
| 140 |
}
|
| 141 |
|
| 142 |
+
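# If the user left the OpenAI fields blank, remove the `openai` key from data.
# NOTE(review): `<= 3` also matches a config with all three fields filled in, so this may drop a valid config; confirm the intended threshold.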
|
| 143 |
if len(openai_dict) <= 3:
|
| 144 |
data.pop("openai")
|
| 145 |
|
| 146 |
+
# Decide if we're sending a file or a URL
|
| 147 |
+
files = {}
|
| 148 |
+
if file_obj:
|
| 149 |
+
# If file is provided, it takes priority
|
| 150 |
+
files = {"file": (filename, file_obj)}
|
| 151 |
+
data["url"] = "" # ensure 'url' is empty on the form
|
| 152 |
+
elif url and url.strip():
|
| 153 |
+
data["url"] = url.strip()
|
| 154 |
+
else:
|
| 155 |
+
return ("❌ Please upload a file or provide a URL.", "")
|
| 156 |
+
|
| 157 |
+
# Perform the POST request
|
| 158 |
try:
|
| 159 |
response = requests.post(DOCSIFER_API_URL, files=files, data=data, timeout=30)
|
| 160 |
except requests.exceptions.RequestException as e:
|
|
|
|
| 163 |
if response.status_code != 200:
|
| 164 |
return (f"❌ API Error {response.status_code}: {response.text}", "")
|
| 165 |
|
| 166 |
+
# Parse the API response
|
| 167 |
try:
|
| 168 |
converted = response.json()
|
| 169 |
+
# Expected structure: { "filename": "...", "markdown": "..." }
|
| 170 |
markdown_content = converted["markdown"]
|
| 171 |
except Exception as e:
|
| 172 |
return (f"❌ Error parsing JSON: {str(e)}", "")
|
| 173 |
|
| 174 |
+
# Write the returned Markdown to a temp .md file
|
| 175 |
with tempfile.NamedTemporaryFile(
|
| 176 |
mode="w+", suffix=".md", dir="/tmp", delete=False
|
| 177 |
) as tmp_file:
|
|
|
|
| 183 |
|
| 184 |
def call_stats_api_df() -> Tuple[pd.DataFrame, pd.DataFrame]:
|
| 185 |
"""
|
| 186 |
+
Call /v1/stats endpoint to retrieve analytics data and return two DataFrames:
|
| 187 |
+
- access_df: Access statistics
|
| 188 |
+
- tokens_df: Token usage statistics
|
| 189 |
+
|
| 190 |
+
Raises:
|
| 191 |
+
ValueError: If the stats endpoint fails or returns invalid data.
|
| 192 |
+
|
| 193 |
+
Returns:
|
| 194 |
+
Tuple[pd.DataFrame, pd.DataFrame]:
|
| 195 |
+
(access_df, tokens_df) with columns ["Model", "Total", "Daily",
|
| 196 |
+
"Weekly", "Monthly", "Yearly"].
|
| 197 |
"""
|
| 198 |
try:
|
| 199 |
response = requests.get(DOCSIFER_STATS_URL, timeout=10)
|
|
|
|
| 213 |
tokens_data = data.get("tokens", {})
|
| 214 |
|
| 215 |
def build_stats_df(bucket: dict) -> pd.DataFrame:
|
| 216 |
+
"""
|
| 217 |
+
Helper function to transform a nested dictionary (by period, by model)
|
| 218 |
+
into a tabular pandas DataFrame.
|
| 219 |
+
"""
|
| 220 |
all_models = set()
|
| 221 |
for period_key in ["total", "daily", "weekly", "monthly", "yearly"]:
|
| 222 |
period_dict = bucket.get(period_key, {})
|
|
|
|
| 248 |
|
| 249 |
def create_main_interface():
|
| 250 |
"""
|
| 251 |
+
Create a Gradio Blocks interface that includes:
|
| 252 |
+
1) 'Conversion Playground' Tab:
|
| 253 |
+
- File upload OR URL-based conversion
|
| 254 |
+
- Optional OpenAI configuration
|
| 255 |
+
- Convert button
|
| 256 |
+
- Display of conversion result as Markdown
|
| 257 |
+
- Downloadable .md file
|
| 258 |
+
2) 'Analytics Stats' Tab:
|
| 259 |
+
- Button to fetch usage statistics
|
| 260 |
+
- DataFrames for Access Stats and Token Stats
|
| 261 |
+
|
| 262 |
+
Returns:
|
| 263 |
+
Gradio Blocks instance that can be mounted into the FastAPI app.
|
| 264 |
"""
|
| 265 |
with gr.Blocks(title="Docsifer: Convert to Markdown", theme="default") as demo:
|
| 266 |
gr.Markdown(APP_DESCRIPTION)
|
| 267 |
|
| 268 |
with gr.Tab("Conversion Playground"):
|
| 269 |
+
gr.Markdown("### Convert your files or a URL to Markdown with Docsifer.")
|
| 270 |
|
| 271 |
with gr.Row():
|
| 272 |
+
# Left Column: File Upload, URL Input, Settings, Button
|
| 273 |
with gr.Column():
|
| 274 |
file_input = gr.File(
|
| 275 |
+
label="Upload File (optional)",
|
| 276 |
file_types=[
|
| 277 |
".pdf",
|
| 278 |
".docx",
|
|
|
|
| 290 |
type="binary",
|
| 291 |
)
|
| 292 |
|
| 293 |
+
url_input = gr.Textbox(
|
| 294 |
+
label="URL (optional)",
|
| 295 |
+
placeholder="Enter a URL if no file is uploaded",
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
with gr.Accordion("OpenAI Configuration (Optional)", open=False):
|
| 299 |
gr.Markdown(
|
| 300 |
"Provide these if you'd like **LLM-assisted** extraction. "
|
|
|
|
| 319 |
|
| 320 |
with gr.Accordion("Conversion Settings", open=True):
|
| 321 |
gr.Markdown(
|
| 322 |
+
"Enable to remove <style> tags or hidden elements "
|
| 323 |
+
"from `.html` files before conversion."
|
| 324 |
)
|
| 325 |
cleanup_toggle = gr.Checkbox(
|
| 326 |
label="Enable Cleanup",
|
|
|
|
| 329 |
|
| 330 |
convert_btn = gr.Button("Convert")
|
| 331 |
|
| 332 |
+
# Right Column: Conversion Result Display & Download
|
| 333 |
with gr.Column():
|
| 334 |
+
# Display the result as Markdown
|
| 335 |
+
output_md = gr.Markdown(label="Conversion Result (Markdown)")
|
| 336 |
+
|
| 337 |
+
# The user can still download the .md file
|
|
|
|
|
|
|
| 338 |
download_file = gr.File(
|
| 339 |
label="Download",
|
| 340 |
interactive=False,
|
|
|
|
| 353 |
-F "openai={\\"api_key\\":\\"sk-xxxxx\\",\\"model\\":\\"gpt-4o-mini\\",\\"base_url\\":\\"https://api.openai.com/v1\\"}" \\
|
| 354 |
-F "settings={\\"cleanup\\":true}"
|
| 355 |
```
|
| 356 |
+
|
| 357 |
+
**Convert from a URL (no file)**:
|
| 358 |
+
```bash
|
| 359 |
+
curl -X POST \\
|
| 360 |
+
"https://lamhieu-docsifer.hf.space/v1/convert" \\
|
| 361 |
+
-F "url=https://example.com/page.html" \\
|
| 362 |
+
-F "openai={\\"api_key\\":\\"sk-xxxxx\\",\\"model\\":\\"gpt-4o-mini\\",\\"base_url\\":\\"https://api.openai.com/v1\\"}" \\
|
| 363 |
+
-F "settings={\\"cleanup\\":true}"
|
| 364 |
+
```
|
| 365 |
"""
|
| 366 |
)
|
| 367 |
|
| 368 |
+
# Callback function triggered by convert_btn.click
|
| 369 |
+
def on_convert(file_bytes, url_str, base_url, api_key, model_id, cleanup):
|
| 370 |
"""
|
| 371 |
+
Converts the uploaded file or a URL to Markdown by calling the Docsifer
|
| 372 |
+
API. Returns the resulting Markdown content and path to the
|
| 373 |
+
temporary .md file for download.
|
| 374 |
+
|
| 375 |
+
Args:
|
| 376 |
+
file_bytes (bytes): The raw file content (None if not uploaded).
|
| 377 |
+
url_str (str): The URL to convert (only used if file_bytes is None).
|
| 378 |
+
base_url (str): The base URL for OpenAI or compatible LLM.
|
| 379 |
+
api_key (str): The API key for the LLM.
|
| 380 |
+
model_id (str): The model to use for the LLM.
|
| 381 |
+
cleanup (bool): Whether to enable cleanup on HTML files.
|
| 382 |
+
|
| 383 |
+
Returns:
|
| 384 |
+
(str, str):
|
| 385 |
+
- The Markdown content or error message.
|
| 386 |
+
- The path to the temp .md file for download.
|
| 387 |
"""
|
| 388 |
+
# If file is not provided, we attempt the URL approach
|
| 389 |
+
if not file_bytes and not url_str:
|
| 390 |
+
return "❌ Please upload a file or provide a URL.", None
|
| 391 |
+
|
| 392 |
+
# Create a unique temporary filename if file is present
|
| 393 |
+
unique_name = f"{scuid()}.tmp" if file_bytes else ""
|
| 394 |
|
| 395 |
+
# Call the convert API
|
| 396 |
markdown, temp_md_path = call_convert_api(
|
| 397 |
file_obj=file_bytes,
|
| 398 |
filename=unique_name,
|
| 399 |
+
url=url_str,
|
| 400 |
openai_base_url=base_url,
|
| 401 |
openai_api_key=api_key,
|
| 402 |
openai_model=model_id,
|
| 403 |
cleanup=cleanup,
|
| 404 |
)
|
| 405 |
+
|
| 406 |
return markdown, temp_md_path
|
| 407 |
|
| 408 |
+
# Link the on_convert function to the convert_btn
|
| 409 |
convert_btn.click(
|
| 410 |
fn=on_convert,
|
| 411 |
inputs=[
|
| 412 |
file_input,
|
| 413 |
+
url_input,
|
| 414 |
openai_base_url,
|
| 415 |
openai_api_key,
|
| 416 |
openai_model,
|
|
|
|
| 424 |
"View Docsifer usage statistics (access count, token usage, etc.)"
|
| 425 |
)
|
| 426 |
stats_btn = gr.Button("Get Stats")
|
| 427 |
+
|
| 428 |
access_df = gr.DataFrame(
|
| 429 |
label="Access Stats",
|
| 430 |
headers=["Model", "Total", "Daily", "Weekly", "Monthly", "Yearly"],
|
|
|
|
| 436 |
interactive=False,
|
| 437 |
)
|
| 438 |
|
| 439 |
+
# When the button is clicked, call_stats_api_df returns two dataframes
|
| 440 |
stats_btn.click(
|
| 441 |
fn=call_stats_api_df,
|
| 442 |
inputs=[],
|
|
|
|
| 446 |
return demo
|
| 447 |
|
| 448 |
|
|
|
|
| 449 |
main_interface = create_main_interface()
|
| 450 |
mount_gradio_app(app, main_interface, path="/")
|
| 451 |
|
| 452 |
|
|
|
|
| 453 |
@app.on_event("startup")
|
| 454 |
async def startup_event():
|
| 455 |
+
"""
|
| 456 |
+
Logs a startup message when the Docsifer Service is starting.
|
| 457 |
+
"""
|
| 458 |
logger.info("Docsifer Service is starting up...")
|
| 459 |
|
| 460 |
|
| 461 |
@app.on_event("shutdown")
|
| 462 |
async def shutdown_event():
|
| 463 |
+
"""
|
| 464 |
+
Logs a shutdown message when the Docsifer Service is shutting down.
|
| 465 |
+
"""
|
| 466 |
logger.info("Docsifer Service is shutting down.")
|
docsifer/router.py
CHANGED
|
@@ -4,10 +4,12 @@ import logging
|
|
| 4 |
import json
|
| 5 |
import tempfile
|
| 6 |
import os
|
|
|
|
| 7 |
from pathlib import Path
|
| 8 |
|
| 9 |
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, BackgroundTasks
|
| 10 |
from pydantic import BaseModel
|
|
|
|
| 11 |
|
| 12 |
from .service import DocsiferService
|
| 13 |
from .analytics import Analytics
|
|
@@ -34,17 +36,21 @@ class ConvertResponse(BaseModel):
|
|
| 34 |
@router.post("/convert", response_model=ConvertResponse)
|
| 35 |
async def convert_document(
|
| 36 |
background_tasks: BackgroundTasks,
|
| 37 |
-
file: UploadFile = File(
|
|
|
|
|
|
|
|
|
|
| 38 |
openai: str = Form("{}", description="OpenAI config as a JSON object"),
|
| 39 |
settings: str = Form("{}", description="Settings as a JSON object"),
|
| 40 |
):
|
| 41 |
"""
|
| 42 |
-
Convert a
|
| 43 |
-
|
| 44 |
-
-
|
| 45 |
-
-
|
| 46 |
"""
|
| 47 |
try:
|
|
|
|
| 48 |
try:
|
| 49 |
openai_config = json.loads(openai) if openai else {}
|
| 50 |
except json.JSONDecodeError:
|
|
@@ -57,22 +63,43 @@ async def convert_document(
|
|
| 57 |
|
| 58 |
cleanup = settings_config.get("cleanup", True)
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
)
|
| 68 |
|
| 69 |
-
# Track usage
|
| 70 |
background_tasks.add_task(analytics.access, token_count)
|
| 71 |
-
|
| 72 |
return ConvertResponse(**result)
|
| 73 |
|
| 74 |
except Exception as e:
|
| 75 |
-
msg = f"Failed to convert
|
| 76 |
logger.error(msg)
|
| 77 |
raise HTTPException(status_code=500, detail=msg)
|
| 78 |
|
|
|
|
| 4 |
import json
|
| 5 |
import tempfile
|
| 6 |
import os
|
| 7 |
+
import aiohttp
|
| 8 |
from pathlib import Path
|
| 9 |
|
| 10 |
from fastapi import APIRouter, HTTPException, UploadFile, File, Form, BackgroundTasks
|
| 11 |
from pydantic import BaseModel
|
| 12 |
+
from scuid import scuid
|
| 13 |
|
| 14 |
from .service import DocsiferService
|
| 15 |
from .analytics import Analytics
|
|
|
|
| 36 |
@router.post("/convert", response_model=ConvertResponse)
|
| 37 |
async def convert_document(
|
| 38 |
background_tasks: BackgroundTasks,
|
| 39 |
+
file: UploadFile = File(None, description="File to convert"),
|
| 40 |
+
url: str = Form(
|
| 41 |
+
None, description="URL to convert (used only if no file is provided)"
|
| 42 |
+
),
|
| 43 |
openai: str = Form("{}", description="OpenAI config as a JSON object"),
|
| 44 |
settings: str = Form("{}", description="Settings as a JSON object"),
|
| 45 |
):
|
| 46 |
"""
|
| 47 |
+
Convert a file or an HTML page from a URL into Markdown.
|
| 48 |
+
If 'file' is provided, it has priority over 'url'.
|
| 49 |
+
- 'openai' is a JSON string with keys: {"api_key": "...", "base_url": "...", "model": "..."}
|
| 50 |
+
- 'settings' is a JSON string with keys: {"cleanup": bool}
|
| 51 |
"""
|
| 52 |
try:
|
| 53 |
+
# Parse configs
|
| 54 |
try:
|
| 55 |
openai_config = json.loads(openai) if openai else {}
|
| 56 |
except json.JSONDecodeError:
|
|
|
|
| 63 |
|
| 64 |
cleanup = settings_config.get("cleanup", True)
|
| 65 |
|
| 66 |
+
# If a file is provided, use the existing flow
|
| 67 |
+
if file is not None:
|
| 68 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 69 |
+
temp_path = Path(tmpdir) / file.filename
|
| 70 |
+
contents = await file.read()
|
| 71 |
+
temp_path.write_bytes(contents)
|
| 72 |
+
result, token_count = await docsifer_service.convert_file(
|
| 73 |
+
file_path=str(temp_path),
|
| 74 |
+
openai_config=openai_config,
|
| 75 |
+
cleanup=cleanup,
|
| 76 |
+
)
|
| 77 |
+
# Otherwise, fetch HTML from URL and convert
|
| 78 |
+
elif url:
|
| 79 |
+
async with aiohttp.ClientSession() as session:
|
| 80 |
+
async with session.get(url) as resp:
|
| 81 |
+
if resp.status != 200:
|
| 82 |
+
raise ValueError(f"Failed to fetch URL: status {resp.status}")
|
| 83 |
+
data = await resp.read()
|
| 84 |
+
with tempfile.TemporaryDirectory() as tmpdir:
|
| 85 |
+
temp_path = Path(tmpdir) / f"{scuid()}.html"
|
| 86 |
+
temp_path.write_bytes(data)
|
| 87 |
+
result, token_count = await docsifer_service.convert_file(
|
| 88 |
+
file_path=str(temp_path),
|
| 89 |
+
openai_config=openai_config,
|
| 90 |
+
cleanup=cleanup,
|
| 91 |
+
)
|
| 92 |
+
else:
|
| 93 |
+
raise HTTPException(
|
| 94 |
+
status_code=400, detail="Provide either 'file' or 'url'."
|
| 95 |
)
|
| 96 |
|
| 97 |
+
# Track usage
|
| 98 |
background_tasks.add_task(analytics.access, token_count)
|
|
|
|
| 99 |
return ConvertResponse(**result)
|
| 100 |
|
| 101 |
except Exception as e:
|
| 102 |
+
msg = f"Failed to convert content. Error: {str(e)}"
|
| 103 |
logger.error(msg)
|
| 104 |
raise HTTPException(status_code=500, detail=msg)
|
| 105 |
|
requirements.txt
CHANGED
|
@@ -13,3 +13,4 @@ scuid
|
|
| 13 |
python-magic
|
| 14 |
plotly
|
| 15 |
matplotlib
|
|
|
|
|
|
| 13 |
python-magic
|
| 14 |
plotly
|
| 15 |
matplotlib
|
| 16 |
+
aiohttp
|