#!/usr/bin/env python3
"""
Generate REALISTIC medical PDFs that match real-world documents.
Includes proper formatting, layouts, medical terminology, and varied PHI.
"""
import sys
import os
from pathlib import Path
from faker import Faker
import random
import json
from datetime import datetime, timedelta
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.lib.units import inch
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import (
SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle,
PageBreak, KeepTogether, Image as RLImage, Frame, PageTemplate
)
from reportlab.pdfgen import canvas
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT
class RealisticMedicalPDFGenerator:
    """Render synthetic medical documents as PDFs with PHI annotations.

    Each ``create_*`` method builds one document with ReportLab, writes it to
    ``output_path`` and returns a list of annotation dicts (``type``,
    ``value``, ``page`` and optionally ``context``) describing the
    identifying values that were placed in the document.
    """

    # Dispense quantities, kept as (digits, words) pairs so the two always agree.
    _DISPENSE_OPTIONS = ((30, 'thirty'), (60, 'sixty'), (90, 'ninety'))

    _RESULT_HEADER = ('Test Name', 'Result', 'Units', 'Reference Range', 'Flag')

    # Each entry: (test name, lowest generated value, highest generated value,
    # decimals (0 = integer result), units, printed reference range).
    _CMP_TESTS = (
        ('Glucose', 70, 130, 0, 'mg/dL', '70-99'),
        ('BUN', 7, 25, 0, 'mg/dL', '7-20'),
        ('Creatinine', 0.6, 1.3, 1, 'mg/dL', '0.7-1.3'),
        ('eGFR', 60, 120, 0, 'mL/min', '>60'),
        ('Sodium', 135, 145, 0, 'mEq/L', '136-144'),
        ('Potassium', 3.5, 5.1, 1, 'mEq/L', '3.5-5.0'),
        ('Chloride', 96, 106, 0, 'mEq/L', '96-106'),
        ('CO2', 22, 29, 0, 'mEq/L', '22-28'),
        ('Calcium', 8.5, 10.5, 1, 'mg/dL', '8.5-10.5'),
        ('Total Protein', 6.0, 8.3, 1, 'g/dL', '6.0-8.3'),
        ('Albumin', 3.5, 5.0, 1, 'g/dL', '3.5-5.0'),
        ('Bilirubin, Total', 0.1, 1.2, 1, 'mg/dL', '0.1-1.2'),
        ('Alk Phos', 30, 120, 0, 'IU/L', '30-120'),
        ('AST (SGOT)', 10, 40, 0, 'IU/L', '10-40'),
        ('ALT (SGPT)', 7, 56, 0, 'IU/L', '7-56'),
    )
    _CBC_TESTS = (
        ('WBC', 4.0, 11.0, 1, 'K/uL', '4.0-11.0'),
        ('RBC', 4.2, 5.9, 2, 'M/uL', '4.2-5.9'),
        ('Hemoglobin', 12.0, 17.0, 1, 'g/dL', '12.0-16.0'),
        ('Hematocrit', 36.0, 48.0, 1, '%', '36.0-46.0'),
        ('MCV', 80, 100, 0, 'fL', '80-100'),
        ('MCH', 27.0, 34.0, 1, 'pg', '27.0-34.0'),
        ('MCHC', 32.0, 36.0, 1, 'g/dL', '32.0-36.0'),
        ('Platelets', 150, 400, 0, 'K/uL', '150-400'),
    )

    def __init__(self):
        """Create the Faker instance, seed Faker and build paragraph styles."""
        self.faker = Faker()
        # Class-level seed call, kept exactly as in the original constructor.
        Faker.seed(42)
        self.styles = getSampleStyleSheet()
        self._setup_custom_styles()

    def _setup_custom_styles(self):
        """Create custom styles for medical documents."""
        # Header style
        self.header_style = ParagraphStyle(
            'MedicalHeader',
            parent=self.styles['Heading1'],
            fontSize=16,
            textColor=colors.HexColor('#003366'),
            alignment=TA_CENTER,
            spaceAfter=6,
            fontName='Helvetica-Bold',
        )
        # Facility name
        self.facility_style = ParagraphStyle(
            'Facility',
            parent=self.styles['Normal'],
            fontSize=14,
            textColor=colors.HexColor('#004080'),
            alignment=TA_CENTER,
            spaceAfter=3,
            fontName='Helvetica-Bold',
        )
        # Small text
        self.small_style = ParagraphStyle(
            'Small',
            parent=self.styles['Normal'],
            fontSize=8,
            textColor=colors.grey,
            alignment=TA_CENTER,
        )
        # Body text
        self.body_style = ParagraphStyle(
            'Body',
            parent=self.styles['Normal'],
            fontSize=10,
            leading=14,
        )
        # Section header
        self.section_style = ParagraphStyle(
            'Section',
            parent=self.styles['Heading2'],
            fontSize=11,
            textColor=colors.HexColor('#004080'),
            spaceBefore=8,
            spaceAfter=4,
            fontName='Helvetica-Bold',
        )

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _markup_lines(lines):
        """Join *lines* with ``<br/>`` so a Paragraph keeps them on separate lines.

        Plain newlines inside Paragraph text are treated as ordinary
        whitespace, which would merge a multi-line block into one line.
        """
        return '<br/>'.join(lines)

    @classmethod
    def _dispense_text(cls):
        """Return a dispense quantity such as ``#30 (thirty)``.

        Digits and words come from the same pair so they never disagree.
        """
        quantity, words = random.choice(cls._DISPENSE_OPTIONS)
        return f"#{quantity} ({words})"

    @staticmethod
    def _flag_result(value, reference_range):
        """Return ``'H'``, ``'L'`` or ``''`` for *value* against a printed range.

        Supported range formats are ``'low-high'`` (inclusive bounds),
        ``'>bound'`` and ``'<bound'`` (exclusive bounds).
        """
        reference = reference_range.strip()
        if reference.startswith('>'):
            return 'L' if value <= float(reference[1:]) else ''
        if reference.startswith('<'):
            return 'H' if value >= float(reference[1:]) else ''
        low_text, _, high_text = reference.partition('-')
        if value < float(low_text):
            return 'L'
        if value > float(high_text):
            return 'H'
        return ''

    @classmethod
    def _panel_rows(cls, tests):
        """Build table rows (header first) for one lab panel.

        A random result is drawn for every entry of *tests*, and its flag is
        derived from the printed reference range rather than chosen at random.
        """
        rows = [list(cls._RESULT_HEADER)]
        for name, low, high, decimals, units, reference in tests:
            if decimals:
                value = round(random.uniform(low, high), decimals)
            else:
                value = random.randint(low, high)
            rows.append([name, str(value), units, reference,
                         cls._flag_result(value, reference)])
        return rows

    @staticmethod
    def _field_table(rows, col_widths, label_cols=(0,), font_size=8,
                     grid_width=0.25, grid_color=None, valign=None,
                     shade_labels=False):
        """Return a boxed label/value Table with bold label columns.

        ``grid_color`` defaults to grey; ``valign`` and ``shade_labels`` add
        the optional vertical alignment and label-column background.
        """
        if grid_color is None:
            grid_color = colors.grey
        commands = [
            ('FONTNAME', (col, 0), (col, -1), 'Helvetica-Bold')
            for col in label_cols
        ]
        commands.extend([
            ('FONTSIZE', (0, 0), (-1, -1), font_size),
            ('BOX', (0, 0), (-1, -1), 1, colors.black),
            ('GRID', (0, 0), (-1, -1), grid_width, grid_color),
        ])
        if valign is not None:
            commands.append(('VALIGN', (0, 0), (-1, -1), valign))
        if shade_labels:
            commands.extend(
                ('BACKGROUND', (col, 0), (col, -1), colors.HexColor('#f0f0f0'))
                for col in label_cols
            )
        table = Table(rows, colWidths=col_widths)
        table.setStyle(TableStyle(commands))
        return table

    @staticmethod
    def _results_table(rows):
        """Return a styled lab-results Table (dark header row, red flag column)."""
        table = Table(rows, colWidths=[2 * inch, 0.8 * inch, 0.8 * inch,
                                       1.5 * inch, 0.6 * inch])
        table.setStyle(TableStyle([
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, -1), 8),
            ('BOX', (0, 0), (-1, -1), 1, colors.black),
            ('GRID', (0, 0), (-1, -1), 0.25, colors.grey),
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#004080')),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.white),
            ('ALIGN', (1, 1), (-1, -1), 'CENTER'),
            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
            # Highlight abnormal values
            ('TEXTCOLOR', (4, 1), (4, -1), colors.red),
            ('FONTNAME', (4, 1), (4, -1), 'Helvetica-Bold'),
        ]))
        return table

    # ------------------------------------------------------------------
    # Documents
    # ------------------------------------------------------------------
    def create_prescription(self, patient_data, output_path):
        """Create a realistic prescription PDF.

        Args:
            patient_data: dict providing ``name``, ``birth_date``,
                ``address``, ``phone`` and ``medications`` (a list of dicts
                with ``name`` and ``dosage``); at most the first three
                medications are printed.
            output_path: destination of the PDF (passed through ``str``).

        Returns:
            List of annotation dicts for the patient and prescriber values.
        """
        doc = SimpleDocTemplate(str(output_path), pagesize=letter,
                                topMargin=0.5 * inch, bottomMargin=0.5 * inch)
        story = []
        annotations = []

        # Facility header
        clinic_name = f"{self.faker.company()} Medical Group"
        clinic_address = self.faker.address().replace('\n', ', ')
        clinic_phone = self.faker.phone_number()
        clinic_fax = clinic_phone  # Reuse for simplicity
        story.append(Paragraph(clinic_name, self.facility_style))
        story.append(Paragraph(clinic_address, self.small_style))
        story.append(Paragraph(f"Phone: {clinic_phone} | Fax: {clinic_fax}",
                               self.small_style))
        story.append(Spacer(1, 0.2 * inch))

        # Prescription header
        story.append(Paragraph("PRESCRIPTION", self.header_style))
        story.append(Spacer(1, 0.15 * inch))

        # Patient information box
        rx_date = datetime.now()
        rx_date_text = rx_date.strftime('%m/%d/%Y')
        patient_info = [
            ['Patient Name:', patient_data['name']],
            ['Date of Birth:', patient_data['birth_date']],
            ['Address:', patient_data['address']],
            ['Phone:', patient_data['phone']],
            ['Date:', rx_date_text],
        ]
        # NOTE(review): the rx_date annotation is ISO formatted while the PDF
        # prints MM/DD/YYYY; confirm which form annotation consumers expect.
        annotations.extend([
            {'type': 'name', 'value': patient_data['name'], 'page': 1, 'context': 'patient'},
            {'type': 'date', 'value': patient_data['birth_date'], 'page': 1, 'context': 'dob'},
            {'type': 'address', 'value': patient_data['address'], 'page': 1},
            {'type': 'phone', 'value': patient_data['phone'], 'page': 1},
            {'type': 'date', 'value': rx_date.strftime('%Y-%m-%d'), 'page': 1, 'context': 'rx_date'},
        ])
        story.append(self._field_table(
            patient_info, [1.5 * inch, 5 * inch],
            font_size=9, grid_width=0.5, valign='TOP', shade_labels=True,
        ))
        story.append(Spacer(1, 0.3 * inch))

        # RX symbol. Leading is set explicitly because the default leading is
        # sized for body text, not a 24pt glyph.
        # NOTE(review): confirm the default font actually contains this glyph.
        story.append(Paragraph("℞", ParagraphStyle(
            'Rx', fontSize=24, leading=28,
            textColor=colors.HexColor('#003366'))))
        story.append(Spacer(1, 0.1 * inch))

        # Medications
        for number, med in enumerate(patient_data['medications'][:3], 1):
            generic_ok = random.choice(['Yes', 'No - Brand Medically Necessary'])
            med_text = self._markup_lines([
                f"{number}. {med['name']}",
                f"Sig: {med['dosage']}",
                f"Disp: {self._dispense_text()}",
                f"Refills: {random.randint(0, 5)}",
                f"Generic OK: {generic_ok}",
            ])
            story.append(Paragraph(med_text, self.body_style))
            story.append(Spacer(1, 0.15 * inch))

        # Prescriber info
        prescriber = f"Dr. {self.faker.name()}"
        dea = f"DEA: {random.choice(['A', 'B', 'F'])}{self.faker.random_number(digits=7, fix_len=True)}"
        npi = f"NPI: {self.faker.random_number(digits=10, fix_len=True)}"
        state_license = f"License: {self.faker.random_number(digits=8, fix_len=True)}"
        annotations.append({'type': 'name', 'value': prescriber, 'page': 1, 'context': 'prescriber'})
        annotations.append({'type': 'license', 'value': dea, 'page': 1, 'context': 'dea'})
        annotations.append({'type': 'unique_id', 'value': npi, 'page': 1, 'context': 'npi'})
        annotations.append({'type': 'license', 'value': state_license, 'page': 1, 'context': 'state_license'})
        story.append(Spacer(1, 0.3 * inch))
        prescriber_text = self._markup_lines([
            f"Prescriber: {prescriber}, MD",
            f"{dea} | {npi} | {state_license}",
            f"Signature: _____________________________ Date: {rx_date_text}",
        ])
        story.append(Paragraph(prescriber_text, self.body_style))

        # Footer
        story.append(Spacer(1, 0.2 * inch))
        footer = "This prescription is valid for one year from the date written unless otherwise specified."
        story.append(Paragraph(footer, self.small_style))

        doc.build(story)
        return annotations

    def create_lab_report(self, patient_data, output_path):
        """Create a realistic laboratory report PDF.

        Args:
            patient_data: dict providing ``name``, ``birth_date``, ``mrn``,
                ``ssn`` and ``phone``.
            output_path: destination of the PDF (passed through ``str``).

        Returns:
            List of annotation dicts for the patient values, the collection
            and report dates, and the signing pathologist.
        """
        doc = SimpleDocTemplate(str(output_path), pagesize=letter,
                                topMargin=0.5 * inch, bottomMargin=0.5 * inch)
        story = []
        annotations = []

        # Lab header
        lab_name = f"{self.faker.company()} Clinical Laboratory"
        lab_address = self.faker.address().replace('\n', ', ')
        clia = f"CLIA #: {self.faker.random_number(digits=10, fix_len=True)}"
        story.append(Paragraph(lab_name, self.facility_style))
        story.append(Paragraph(lab_address, self.small_style))
        story.append(Paragraph(f"{clia} | CAP Accredited", self.small_style))
        story.append(Spacer(1, 0.2 * inch))
        story.append(Paragraph("LABORATORY REPORT", self.header_style))
        story.append(Spacer(1, 0.15 * inch))

        # Patient and specimen info
        collection_date = datetime.now() - timedelta(days=random.randint(1, 7))
        report_date = datetime.now()
        collected_text = collection_date.strftime('%m/%d/%Y %H:%M')
        reported_text = report_date.strftime('%m/%d/%Y %H:%M')
        specimen_id = f"SPEC-{self.faker.random_number(digits=10, fix_len=True)}"
        accession = f"ACC-{self.faker.random_number(digits=8, fix_len=True)}"
        patient_info = [
            ['Patient Name:', patient_data['name'], 'Ordering Physician:', f"Dr. {self.faker.last_name()}"],
            ['DOB:', patient_data['birth_date'], 'Collected:', collected_text],
            ['MRN:', patient_data['mrn'], 'Received:', collected_text],
            ['SSN:', patient_data['ssn'], 'Reported:', reported_text],
            ['Phone:', patient_data['phone'], 'Specimen ID:', specimen_id],
            ['', '', 'Accession:', accession],
        ]
        # NOTE(review): the date annotations are ISO formatted while the PDF
        # prints MM/DD/YYYY HH:MM; confirm which form consumers expect.
        annotations.extend([
            {'type': 'name', 'value': patient_data['name'], 'page': 1},
            {'type': 'date', 'value': patient_data['birth_date'], 'page': 1},
            {'type': 'mrn', 'value': patient_data['mrn'], 'page': 1},
            {'type': 'ssn', 'value': patient_data['ssn'], 'page': 1},
            {'type': 'phone', 'value': patient_data['phone'], 'page': 1},
            {'type': 'date', 'value': collection_date.strftime('%Y-%m-%d'), 'page': 1},
            {'type': 'date', 'value': report_date.strftime('%Y-%m-%d'), 'page': 1},
        ])
        story.append(self._field_table(
            patient_info, [1.3 * inch, 2.2 * inch, 1.3 * inch, 2.2 * inch],
            label_cols=(0, 2), valign='MIDDLE', shade_labels=True,
        ))
        story.append(Spacer(1, 0.25 * inch))

        # Test results
        story.append(Paragraph("COMPREHENSIVE METABOLIC PANEL", self.section_style))
        story.append(self._results_table(self._panel_rows(self._CMP_TESTS)))
        story.append(Spacer(1, 0.2 * inch))

        # CBC
        story.append(Paragraph("COMPLETE BLOOD COUNT", self.section_style))
        story.append(self._results_table(self._panel_rows(self._CBC_TESTS)))
        story.append(Spacer(1, 0.2 * inch))

        # Pathologist signature
        pathologist = f"Dr. {self.faker.name()}"
        annotations.append({'type': 'name', 'value': pathologist, 'page': 1, 'context': 'pathologist'})
        signature = self._markup_lines([
            f"Electronically signed by: {pathologist}, MD",
            "Board Certified Clinical Pathologist",
            reported_text,
        ])
        story.append(Paragraph(signature, self.body_style))

        # Footer
        story.append(Spacer(1, 0.15 * inch))
        footer = "This report has been electronically signed. No signature required for legal validity."
        story.append(Paragraph(footer, self.small_style))

        doc.build(story)
        return annotations

    def create_insurance_claim(self, patient_data, output_path):
        """Create a realistic CMS-1500 style insurance claim PDF.

        Args:
            patient_data: dict providing ``name``, ``birth_date``,
                ``address``, ``phone``, ``insurance_id`` and ``ssn``.
            output_path: destination of the PDF (passed through ``str``).

        Returns:
            List of annotation dicts for the patient, insurance and provider
            values plus the service and signature dates.
        """
        doc = SimpleDocTemplate(str(output_path), pagesize=letter,
                                topMargin=0.3 * inch, bottomMargin=0.3 * inch)
        story = []
        annotations = []
        field_widths = [2.5 * inch, 4 * inch]

        # Form header
        story.append(Paragraph("HEALTH INSURANCE CLAIM FORM", self.header_style))
        story.append(Paragraph("(CMS-1500 - 02/12)", self.small_style))
        story.append(Spacer(1, 0.15 * inch))

        # Insurance carrier info
        carrier = f"{self.faker.company()} Health Insurance"
        carrier_address = self.faker.address().replace('\n', ', ')
        carrier_box = [
            ['CARRIER', carrier],
            ['Address', carrier_address],
        ]
        story.append(self._field_table(
            carrier_box, [1 * inch, 5.5 * inch],
            grid_width=0.5, grid_color=colors.black,
        ))
        story.append(Spacer(1, 0.15 * inch))

        # Patient information
        story.append(Paragraph("1-8. PATIENT INFORMATION", self.section_style))
        # NOTE(review): this city/state/ZIP line is generated independently of
        # patient_data['address'] and is not annotated; confirm that is intended.
        city_state_zip = f"{self.faker.city()}, {self.faker.state_abbr()} {self.faker.postcode()}"
        patient_info = [
            ['1. Patient Name (Last, First, MI)', patient_data['name']],
            ['2. Patient Date of Birth', patient_data['birth_date']],
            ['3. Patient Sex', random.choice(['M', 'F'])],
            ['4. Insured Name', patient_data['name']],
            ['5. Patient Address', patient_data['address']],
            ['6. Patient City, State, ZIP', city_state_zip],
            ['7. Patient Phone', patient_data['phone']],
            ['8. Patient Status', 'Single'],
        ]
        annotations.extend([
            {'type': 'name', 'value': patient_data['name'], 'page': 1},
            {'type': 'date', 'value': patient_data['birth_date'], 'page': 1},
            {'type': 'address', 'value': patient_data['address'], 'page': 1},
            {'type': 'phone', 'value': patient_data['phone'], 'page': 1},
        ])
        story.append(self._field_table(patient_info, field_widths, valign='TOP'))
        story.append(Spacer(1, 0.15 * inch))

        # Insurance information
        story.append(Paragraph("9-13. INSURANCE INFORMATION", self.section_style))
        insurance_info = [
            ['9. Insured ID Number', patient_data['insurance_id']],
            ['10. Patient Relationship to Insured', 'Self'],
            ['11. Insured Group Number', f"GRP-{self.faker.random_number(digits=6, fix_len=True)}"],
            ['12. Insured Date of Birth', patient_data['birth_date']],
            ['13. Insured SSN', patient_data['ssn']],
        ]
        annotations.extend([
            {'type': 'insurance_id', 'value': patient_data['insurance_id'], 'page': 1},
            {'type': 'ssn', 'value': patient_data['ssn'], 'page': 1},
        ])
        story.append(self._field_table(insurance_info, field_widths))
        story.append(Spacer(1, 0.15 * inch))

        # Provider information
        story.append(Paragraph("14-23. PROVIDER INFORMATION", self.section_style))
        provider_name = f"Dr. {self.faker.name()}"
        provider_npi = f"{self.faker.random_number(digits=10, fix_len=True)}"
        provider_tax_id = f"{self.faker.random_number(digits=2, fix_len=True)}-{self.faker.random_number(digits=7, fix_len=True)}"
        facility_name = f"{self.faker.company()} Medical Center"
        facility_address = self.faker.address().replace('\n', ', ')
        annotations.extend([
            {'type': 'name', 'value': provider_name, 'page': 1, 'context': 'provider'},
            {'type': 'unique_id', 'value': provider_npi, 'page': 1, 'context': 'npi'},
            {'type': 'institution', 'value': facility_name, 'page': 1},
            {'type': 'address', 'value': facility_address, 'page': 1, 'context': 'facility'},
        ])
        provider_info = [
            ['Rendering Provider', provider_name],
            ['NPI', provider_npi],
            ['Tax ID', provider_tax_id],
            ['Facility Name', facility_name],
            ['Facility Address', facility_address],
        ]
        story.append(self._field_table(provider_info, field_widths))
        story.append(Spacer(1, 0.15 * inch))

        # Service details
        story.append(Paragraph("24. SERVICE DETAILS", self.section_style))
        service_date = datetime.now() - timedelta(days=random.randint(1, 30))
        service_day = service_date.strftime('%m/%d/%Y')
        annotations.append({'type': 'date', 'value': service_date.strftime('%Y-%m-%d'), 'page': 1, 'context': 'service'})

        # Service lines
        service_data = [
            ['Date', 'CPT/HCPCS', 'Description', 'Units', 'Charge'],
            [service_day, '99213', 'Office Visit - Established', '1', '$150.00'],
            [service_day, '85025', 'Complete Blood Count', '1', '$45.00'],
            [service_day, '80053', 'Comprehensive Metabolic Panel', '1', '$85.00'],
            ['', '', '', 'TOTAL:', '$280.00'],
        ]
        service_table = Table(service_data, colWidths=[1.2 * inch, 1 * inch, 2.5 * inch,
                                                       0.7 * inch, 1 * inch])
        service_table.setStyle(TableStyle([
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTNAME', (3, -1), (-1, -1), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, -1), 8),
            ('BOX', (0, 0), (-1, -1), 1, colors.black),
            # The grid stops before the last row so the TOTAL line has no cell borders.
            ('GRID', (0, 0), (-1, -2), 0.25, colors.grey),
            ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#f0f0f0')),
            ('ALIGN', (3, 0), (-1, -1), 'RIGHT'),
        ]))
        story.append(service_table)
        story.append(Spacer(1, 0.2 * inch))

        # Signature: read the clock once so the printed and annotated dates
        # cannot land on different days.
        signed_on = datetime.now()
        sig_date = signed_on.strftime('%m/%d/%Y')
        annotations.append({'type': 'date', 'value': signed_on.strftime('%Y-%m-%d'), 'page': 1, 'context': 'signature'})
        signature = self._markup_lines([
            "I certify that the statements on this form are true and accurate.",
            f"Patient/Authorized Person Signature: _____________________ Date: {sig_date}",
            f"Provider Signature: _____________________ Date: {sig_date}",
        ])
        story.append(Paragraph(signature, self.body_style))

        doc.build(story)
        return annotations
def main():
    """Command-line entry point: build synthetic patients and render PDFs.

    Parses the command line, generates ``--num-patients`` synthetic patient
    records with Faker, then renders ``--num-documents`` PDFs (a random
    document type for a random patient each time) into ``--output-dir`` and
    writes one JSON annotation file per PDF into ``--annotations-dir``.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Generate REALISTIC medical PDFs')
    parser.add_argument('--num-patients', type=int, default=100, help='Number of patients')
    parser.add_argument('--num-documents', type=int, default=500, help='Number of PDFs')
    parser.add_argument('--output-dir', type=str, default='./data/pdfs', help='Output directory')
    parser.add_argument('--annotations-dir', type=str, default='./data/annotations', help='Annotations directory')
    args = parser.parse_args()

    # random.choice() on an empty patient list would raise IndexError later,
    # so reject unusable counts up front with a proper usage error.
    if args.num_patients < 1:
        parser.error('--num-patients must be at least 1')
    if args.num_documents < 0:
        parser.error('--num-documents must not be negative')

    banner = "=" * 70
    print(banner)
    print("Generating REALISTIC Medical PDFs")
    print(banner)
    print(f"Patients: {args.num_patients}")
    print(f"Documents: {args.num_documents}")
    print()

    # Generate patient data. Faker was already seeded; seed the stdlib RNG
    # with the same value so the random.* draws are reproducible as well.
    seed = 42
    faker = Faker()
    Faker.seed(seed)
    random.seed(seed)

    # Each drug is paired with the dose unit that matches its form, so the
    # dosage text never says "tablet(s)" for a capsule or an inhaler.
    medication_catalog = [
        ('Lisinopril 10mg Tablet', 'tablet(s)'),
        ('Metformin 500mg Tablet', 'tablet(s)'),
        ('Atorvastatin 20mg Tablet', 'tablet(s)'),
        ('Amlodipine 5mg Tablet', 'tablet(s)'),
        ('Omeprazole 20mg Capsule', 'capsule(s)'),
        ('Levothyroxine 50mcg Tablet', 'tablet(s)'),
        ('Albuterol 90mcg Inhaler', 'puff(s)'),
        ('Gabapentin 300mg Capsule', 'capsule(s)'),
    ]
    frequencies = ["once daily", "twice daily", "three times daily", "as needed"]

    def random_medication():
        """Return one medication dict with a form-appropriate dosage line."""
        med_name, dose_unit = random.choice(medication_catalog)
        return {
            'name': med_name,
            'dosage': f'Take {random.randint(1, 2)} {dose_unit} {random.choice(frequencies)}',
        }

    patients = []
    for _ in range(args.num_patients):
        patients.append({
            'name': faker.name(),
            'birth_date': faker.date_of_birth(minimum_age=18, maximum_age=90).strftime('%Y-%m-%d'),
            'ssn': faker.ssn(),
            'phone': faker.phone_number(),
            'email': faker.email(),
            'address': faker.address().replace('\n', ', '),
            'mrn': f"MRN-{faker.random_number(digits=8, fix_len=True)}",
            'insurance_id': f"{random.choice(['ABC', 'XYZ', 'DEF'])}{faker.random_number(digits=9, fix_len=True)}",
            'medications': [random_medication() for _ in range(random.randint(2, 5))],
        })
    print(f"✓ Generated {len(patients)} synthetic patients")

    # Create generator
    generator = RealisticMedicalPDFGenerator()
    output_dir = Path(args.output_dir)
    annotations_dir = Path(args.annotations_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    annotations_dir.mkdir(parents=True, exist_ok=True)
    document_types = [
        ('prescription', generator.create_prescription),
        ('lab_report', generator.create_lab_report),
        ('insurance_claim', generator.create_insurance_claim),
    ]

    print(f"\nGenerating {args.num_documents} realistic medical PDFs...")
    for index in range(args.num_documents):
        patient = random.choice(patients)
        doc_type, create_func = random.choice(document_types)
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        pdf_name = f"{doc_type}_{index:04d}_{timestamp}.pdf"
        pdf_path = output_dir / pdf_name

        # Create PDF
        annotations = create_func(patient, pdf_path)

        # Save annotations next to the PDF name (``<pdf_name>.json``).
        annotation_file = annotations_dir / f"{pdf_name}.json"
        with open(annotation_file, 'w', encoding='utf-8') as f:
            json.dump({
                'document': pdf_name,
                'annotations': annotations,
                'timestamp': datetime.now().isoformat(),
                'num_phi_items': len(annotations),
                # Sorted so the category order is stable between runs.
                'phi_categories': sorted({a['type'] for a in annotations}),
            }, f, indent=2)

        if (index + 1) % 50 == 0:
            print(f" Generated {index + 1}/{args.num_documents} PDFs...")

    pdf_count = sum(1 for _ in output_dir.glob('*.pdf'))
    annotation_count = sum(1 for _ in annotations_dir.glob('*.json'))
    print("\n✓ Generation complete!")
    print("\nGenerated files:")
    print(f" {pdf_count} realistic medical PDFs")
    print(f" {annotation_count} PHI annotation files")
    print("\nFeatures:")
    print(" • Realistic medical formatting and layouts")
    print(" • Proper medical terminology and codes")
    print(" • Multiple PHI categories (names, dates, MRN, SSN, etc.)")
    print(" • Tables with lab results and service details")
    print(" • Professional headers, footers, and signatures")
    print(banner)
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()