Spaces:

Heng2004
/

Laos-Natural-Science-Chatbot

Running

App Files Files Community

Laos-Natural-Science-Chatbot / loader.py

Heng2004

Update loader.py

dd57cad verified about 9 hours ago

raw

history blame contribute delete

8.82 kB

	# loader.py
	import os
	import json
	from typing import List, Dict, Any

	from huggingface_hub import hf_hub_download, HfApi
	# Hugging Face Dataset repo id and the file name inside it; both are passed
	# to the upload/download calls in sync_upload_manual_qa and
	# sync_download_manual_qa below.
	DATASET_REPO_ID = "Heng2004/lao-science-qa-store"
	DATASET_FILENAME = "manual_qa.jsonl"

	import qa_store

	# Filesystem layout: every data file lives in a "data" directory that sits
	# next to this module, so paths do not depend on the working directory.
	BASE_DIR = os.path.dirname(os.path.abspath(__file__))
	DATA_DIR = os.path.join(BASE_DIR, "data")

	# Curriculum text, teacher-entered Q&A and glossary files (JSON Lines).
	CURRICULUM_PATH, MANUAL_QA_PATH, GLOSSARY_PATH = (
	    os.path.join(DATA_DIR, _file_name)
	    for _file_name in ("M_1_U_1.jsonl", "manual_qa.jsonl", "glossary.jsonl")
	)


	def sync_upload_manual_qa() -> str:
	    """
	    Upload the local manual_qa.jsonl back to the Hugging Face Dataset repo.

	    The upload is best-effort: any exception is logged and reported in the
	    returned message rather than propagated to the caller.

	    Returns:
	        A short status message string to display in the UI.
	    """
	    # Skip when the repo id is empty or still contains the placeholder text.
	    if not DATASET_REPO_ID or "YOUR_USERNAME" in DATASET_REPO_ID:
	        return "⚠️ Upload Skipped (Repo ID not set)"

	    print(f"[INFO] Uploading {DATASET_FILENAME} to {DATASET_REPO_ID}...")
	    try:
	        # Pass the token explicitly, the same way sync_download_manual_qa
	        # does, so both directions authenticate from the same source.
	        # HfApi is imported at module level; no local re-import is needed.
	        api = HfApi(token=os.environ.get("HF_TOKEN"))
	        api.upload_file(
	            path_or_fileobj=MANUAL_QA_PATH,
	            path_in_repo=DATASET_FILENAME,
	            repo_id=DATASET_REPO_ID,
	            repo_type="dataset",
	            commit_message="Teacher Panel: Updated Q&A data",
	        )
	    except Exception as e:
	        # Deliberately broad: a failed cloud sync must not break the UI.
	        print(f"[ERROR] Could not upload manual_qa.jsonl: {e}")
	        return f"⚠️ Cloud Upload Failed: {e}"
	    else:
	        print("[INFO] Upload success!")
	        return "☁️ Cloud Upload Success"

	def sync_download_manual_qa() -> None:
	    """
	    Download the latest manual_qa.jsonl from the Hugging Face Dataset repo
	    at startup so we don't lose previous teacher edits.

	    The download is best-effort: on any failure a warning is printed and the
	    existing local file (if any) is left untouched.
	    """
	    import shutil

	    # Skip when the repo id is empty or still contains the placeholder text.
	    if not DATASET_REPO_ID or "YOUR_USERNAME" in DATASET_REPO_ID:
	        print("[WARN] DATASET_REPO_ID is not set. Skipping download.")
	        return

	    print(f"[INFO] Downloading {DATASET_FILENAME} from {DATASET_REPO_ID}...")
	    try:
	        # hf_hub_download (imported at module level) returns the local path
	        # of the fetched file; the token is read from the HF_TOKEN
	        # environment variable.
	        downloaded_path = hf_hub_download(
	            repo_id=DATASET_REPO_ID,
	            filename=DATASET_FILENAME,
	            repo_type="dataset",
	            token=os.environ.get("HF_TOKEN"),
	        )

	        # The data directory may not exist yet (e.g. a fresh checkout);
	        # create it so the copy cannot fail for that reason alone.
	        os.makedirs(os.path.dirname(MANUAL_QA_PATH), exist_ok=True)

	        # Copy the downloaded file into our local data folder.
	        shutil.copy(downloaded_path, MANUAL_QA_PATH)
	        print("[INFO] Download success!")

	    except Exception as e:
	        # Deliberately broad: startup must continue even without the cloud copy.
	        print(f"[WARN] Could not download manual_qa.jsonl: {e}")
	        print("[INFO] Starting with empty or local manual_qa.jsonl instead.")

	def load_curriculum() -> None:
	    """
	    Load official textbook JSONL into qa_store.ENTRIES and AUTO_QA_KNOWLEDGE.

	    Each non-empty line of CURRICULUM_PATH is parsed as JSON. Lines that are
	    not valid JSON, are not JSON objects, or lack a "text" key are skipped.
	    Every usable entry is appended to qa_store.ENTRIES, and each complete
	    question/answer pair under its "qa" key is appended to
	    qa_store.AUTO_QA_KNOWLEDGE. Finally qa_store.RAW_KNOWLEDGE is set to the
	    entries' texts joined by blank lines, or to a fallback message when
	    nothing could be loaded.
	    """
	    qa_store.ENTRIES.clear()
	    qa_store.AUTO_QA_KNOWLEDGE.clear()

	    if not os.path.exists(CURRICULUM_PATH):
	        print(f"[WARN] Curriculum file not found: {CURRICULUM_PATH}")
	        # NOTE(review): this fallback text appears to mention "history" data
	        # although the project is a natural-science chatbot - confirm wording.
	        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດຖືກໂຫຼດ."
	        return

	    with open(CURRICULUM_PATH, "r", encoding="utf-8") as f:
	        for line in f:
	            line = line.strip()
	            if not line:
	                continue
	            try:
	                obj = json.loads(line)
	            except json.JSONDecodeError:
	                print("[WARN] Skipping invalid JSON line in curriculum file.")
	                continue

	            # A line can be valid JSON without being an object (e.g. a number
	            # or a list); such lines cannot carry "text"/"qa" and are skipped
	            # instead of crashing the loader.
	            if not isinstance(obj, dict) or "text" not in obj:
	                continue

	            qa_store.ENTRIES.append(obj)

	            # Tolerate a missing, null or non-list "qa" value.
	            pairs = obj.get("qa")
	            if not isinstance(pairs, list):
	                continue

	            for pair in pairs:
	                if not isinstance(pair, dict):
	                    continue
	                # str() keeps non-string values from breaking .strip().
	                q = str(pair.get("q") or "").strip()
	                a = str(pair.get("a") or "").strip()
	                if not q or not a:
	                    continue
	                qa_store.AUTO_QA_KNOWLEDGE.append(
	                    {
	                        "norm_q": qa_store.normalize_question(q),
	                        "q": q,
	                        "a": a,
	                        "source": "auto",
	                        "id": obj.get("id", ""),
	                    }
	                )

	    if qa_store.ENTRIES:
	        # str() guards the join against a non-string "text" value.
	        qa_store.RAW_KNOWLEDGE = "\n\n".join(
	            str(e["text"]) for e in qa_store.ENTRIES
	        )
	    else:
	        # NOTE(review): same "history" wording as above - confirm.
	        qa_store.RAW_KNOWLEDGE = "ຍັງບໍ່ມີຂໍ້ມູນປະຫວັດສາດທີ່ອ່ານໄດ້."


	def load_glossary() -> None:
	    """
	    Populate qa_store.GLOSSARY from the glossary JSONL file.

	    The list is emptied first. If GLOSSARY_PATH does not exist a warning is
	    printed and the list stays empty. Otherwise every non-blank line that
	    parses as JSON is appended; unparsable lines are reported and skipped.
	    """
	    glossary = qa_store.GLOSSARY
	    glossary.clear()

	    if not os.path.exists(GLOSSARY_PATH):
	        print(f"[WARN] Glossary file not found: {GLOSSARY_PATH}")
	        return

	    with open(GLOSSARY_PATH, "r", encoding="utf-8") as handle:
	        for raw_line in handle:
	            text = raw_line.strip()
	            if text:
	                try:
	                    glossary.append(json.loads(text))
	                except json.JSONDecodeError:
	                    print("[WARN] Skipping invalid glossary JSON line")

	    print(f"[INFO] Loaded {len(qa_store.GLOSSARY)} glossary terms.")


	def load_manual_qa() -> None:
	    """
	    Load manual_qa.jsonl into qa_store.MANUAL_QA_LIST and MANUAL_QA_INDEX.

	    Both containers are cleared first. Each non-empty line of MANUAL_QA_PATH
	    is parsed as JSON; lines that are invalid JSON, are not JSON objects, or
	    lack a non-empty "q" or "a" are skipped. Rows without an "id" receive a
	    generated "manual_NNNN" id. qa_store.NEXT_MANUAL_ID is set to one more
	    than the largest trailing number found in any id (1 when there is none).
	    """
	    import re

	    qa_store.MANUAL_QA_LIST.clear()
	    qa_store.MANUAL_QA_INDEX.clear()

	    if not os.path.exists(MANUAL_QA_PATH):
	        print(f"[WARN] Manual QA file not found: {MANUAL_QA_PATH}")
	        qa_store.NEXT_MANUAL_ID = 1
	        return

	    # Compiled once instead of importing/searching afresh for every line.
	    trailing_number = re.compile(r"(\d+)$")
	    max_num = 0
	    rows = []  # (entry_id or "", q, a) in file order

	    # Pass 1: parse the file and find the largest number used by explicit ids.
	    with open(MANUAL_QA_PATH, "r", encoding="utf-8") as f:
	        for line in f:
	            line = line.strip()
	            if not line:
	                continue
	            try:
	                obj = json.loads(line)
	            except json.JSONDecodeError:
	                print("[WARN] Skipping invalid JSON line in manual QA file.")
	                continue

	            # Valid JSON that is not an object cannot hold q/a fields.
	            if not isinstance(obj, dict):
	                continue

	            # str() keeps non-string values from breaking .strip().
	            q = str(obj.get("q") or "").strip()
	            a = str(obj.get("a") or "").strip()
	            if not q or not a:
	                continue

	            entry_id = str(obj.get("id") or "")
	            m = trailing_number.search(entry_id)
	            if m:
	                max_num = max(max_num, int(m.group(1)))

	            rows.append((entry_id, q, a))

	    # Pass 2: build entries. Missing ids are numbered only after the whole
	    # file has been scanned, so a generated id can never collide with an
	    # explicit id that appears on a later line.
	    for entry_id, q, a in rows:
	        if not entry_id:
	            max_num += 1
	            entry_id = f"manual_{max_num:04d}"

	        norm_q = qa_store.normalize_question(q)
	        entry = {
	            "id": entry_id,
	            "q": q,
	            "a": a,
	            "norm_q": norm_q,
	        }
	        qa_store.MANUAL_QA_LIST.append(entry)
	        # A later row with the same normalized question replaces the earlier
	        # one in the index (both remain in the list).
	        qa_store.MANUAL_QA_INDEX[norm_q] = entry

	    qa_store.NEXT_MANUAL_ID = max_num + 1


	# --- Manual Q&A: ID allocation, persistence and combined index ---

	def generate_new_manual_id() -> str:
	    """
	    Return the lowest unused "manual_XXXX" id.

	    The trailing number of every id in qa_store.MANUAL_QA_LIST is collected
	    and the smallest positive integer not among them is chosen, so numbers
	    freed by a deletion are handed out again. qa_store.NEXT_MANUAL_ID is
	    also set to the chosen number plus one.
	    """
	    import re

	    suffix_pattern = re.compile(r"(\d+)$")

	    # Numbers already taken by existing ids (ids without a numeric tail
	    # contribute nothing).
	    taken = set()
	    for entry in qa_store.MANUAL_QA_LIST:
	        match = suffix_pattern.search(str(entry.get("id") or ""))
	        if match is not None:
	            taken.add(int(match.group(1)))

	    # With k numbers taken, at least one of 1..k+1 must be free, so this
	    # bounded search always finds the smallest free value.
	    free_num = next(
	        n for n in range(1, len(taken) + 2) if n not in taken
	    )

	    # Keep the global counter roughly in step with what was just issued.
	    qa_store.NEXT_MANUAL_ID = free_num + 1

	    return f"manual_{free_num:04d}"



	def save_manual_qa_file() -> None:
	    """
	    Write qa_store.MANUAL_QA_LIST to MANUAL_QA_PATH as JSON Lines.

	    The parent directory is created when missing. Each entry is written as
	    one line holding only its "id", "q" and "a" fields; non-ASCII text is
	    written as-is rather than escaped.
	    """
	    target_dir = os.path.dirname(MANUAL_QA_PATH)
	    os.makedirs(target_dir, exist_ok=True)

	    # Lazy generator: lines are serialized while the file is being written.
	    serialized_lines = (
	        json.dumps(
	            {"id": entry["id"], "q": entry["q"], "a": entry["a"]},
	            ensure_ascii=False,
	        )
	        + "\n"
	        for entry in qa_store.MANUAL_QA_LIST
	    )

	    with open(MANUAL_QA_PATH, "w", encoding="utf-8") as out:
	        out.writelines(serialized_lines)


	def rebuild_combined_qa() -> None:
	    """
	    Rebuild qa_store.QA_INDEX and qa_store.ALL_QA_KNOWLEDGE from scratch.

	    Auto-generated items are added first, then manual ones. Because manual
	    items are written to QA_INDEX last, a manual answer replaces an auto
	    answer that has the same normalized question. ALL_QA_KNOWLEDGE receives
	    every item from both sources.
	    """
	    answer_index = qa_store.QA_INDEX
	    combined = qa_store.ALL_QA_KNOWLEDGE
	    answer_index.clear()
	    combined.clear()

	    # Auto items go in as-is.
	    for auto_item in qa_store.AUTO_QA_KNOWLEDGE:
	        answer_index[auto_item["norm_q"]] = auto_item["a"]
	        combined.append(auto_item)

	    # Manual items are re-wrapped with source="manual" and win in the index.
	    for manual_entry in qa_store.MANUAL_QA_LIST:
	        record = {
	            "norm_q": manual_entry["norm_q"],
	            "q": manual_entry["q"],
	            "a": manual_entry["a"],
	            "source": "manual",
	            "id": manual_entry["id"],
	        }
	        answer_index[record["norm_q"]] = record["a"]
	        combined.append(record)


	def manual_qa_table_data() -> List[List[str]]:
	    """
	    Build the Teacher Panel table: one [id, question, answer] row per
	    entry in qa_store.MANUAL_QA_LIST, in list order.
	    """
	    rows = []
	    for entry in qa_store.MANUAL_QA_LIST:
	        rows.append([entry["id"], entry["q"], entry["a"]])
	    return rows