I’m trying to write code that will create a theological database for me. I’ll admit, I’m using ChatGPT.
Essentially, I want every PDF file downloaded to my Downloads folder to be run through OCR and placed in a new folder that is nice and organized, with subfolders. The code is below. I’m not having any luck. Can anyone help?
"""Sort downloaded PDFs into topic subfolders of a theology library.

Every PDF found under DOWNLOADS_FOLDER has the text of its first pages read
(PyPDF2 first, Tesseract OCR as a fallback), is matched against the keywords
in TOPIC_FOLDERS, renamed to ``title_author_date.pdf`` and moved into the
matching subfolder of THEOLOGY_RESOURCES_FOLDER. PDFs that match no keyword
are moved to REVIEW_FOLDER unchanged.

The script only classifies and files PDFs; it does not write an OCR text
layer back into them.

NOTE: pdf2image needs the Poppler utilities and pytesseract needs the
Tesseract binary to be installed and on PATH.
"""

import os
import shutil

# PyPDF2, pdf2image and pytesseract are imported inside the functions that
# use them, so the keyword-matching and renaming helpers below can be
# imported and tested without the OCR toolchain installed.

# Paths
DOWNLOADS_FOLDER = "/Users/guilhermelopes/Downloads"
THEOLOGY_RESOURCES_FOLDER = "/Users/guilhermelopes/Documents/Theology Resources"
REVIEW_FOLDER = "/Users/guilhermelopes/Desktop/PDF Review"

# Number of leading pages read when looking for topic keywords.
PAGES_TO_SCAN = 3

# Keyword searched for in the text -> destination subfolder.
# Keywords are tried in this order and the first hit wins.
TOPIC_FOLDERS = {
    "Old Testament": "Old Testament",
    "New Testament": "New Testament",
    "Papal Documents": "Papal Documents",
    "Franciscan": "Franciscan Charism",
    "Liturgy": "Theology of Liturgy",
    "Ethics": "Ethics and Social Justice",
    "Catholic Social Teaching": "Catholic Social Teaching",
    "Synodality": "Synodality",
    "Spirituality": "Spirituality",
    "Saints": "Lives of the Saints",
    "Apologetics": "Catholic Apologetics",
    "Church History": "Church History",
    "Scripture Study": "Scripture Study",
    "Canon Law": "Canon Law",
    "Mysticism": "Mysticism and Contemplation",
    "Gospel of John": "New Testament",
    "Ignatius Study Bible": "Scripture Study",
}


def is_pdf(filename):
    """Return True when *filename* has a .pdf extension, in any letter case."""
    return filename.lower().endswith(".pdf")


def unique_path(path):
    """Return *path*, or a numbered variant of it that does not exist yet.

    ``doc.pdf`` becomes ``doc_1.pdf``, ``doc_2.pdf``, ... so that moving a
    file to the returned path never overwrites an earlier one.
    """
    if not os.path.exists(path):
        return path
    stem, ext = os.path.splitext(path)
    counter = 1
    while os.path.exists(f"{stem}_{counter}{ext}"):
        counter += 1
    return f"{stem}_{counter}{ext}"


def extract_text_from_pdf(pdf_path):
    """Return the text of the first PAGES_TO_SCAN pages of a PDF.

    The embedded text layer is read with PyPDF2. When that raises or yields
    only whitespace, the pages are OCR'd via ocr_pdf() instead. Returns an
    empty string when neither approach produces text.
    """
    # Outside the try block on purpose: a missing package must raise, not be
    # reported as an unreadable PDF.
    from PyPDF2 import PdfReader

    try:
        reader = PdfReader(pdf_path)
        text = "".join(
            page.extract_text() or "" for page in reader.pages[:PAGES_TO_SCAN]
        )
    except Exception as e:  # any parsing failure just means "try OCR instead"
        print(f"Error reading {pdf_path} with PyPDF2: {e}")
    else:
        if text.strip():
            return text

    # If no text found, fall back to OCR
    print(f"Falling back to OCR for {pdf_path}")
    return ocr_pdf(pdf_path)


def ocr_pdf(pdf_path):
    """Return Tesseract OCR text for the first PAGES_TO_SCAN pages of a PDF.

    Returns an empty string (after printing the error) when rendering or OCR
    fails.
    """
    import pytesseract
    from pdf2image import convert_from_path

    try:
        # first_page/last_page restrict rendering to the pages that are
        # actually read; without them every page is rasterised at 300 dpi
        # before the first one is looked at.
        images = convert_from_path(
            pdf_path, dpi=300, first_page=1, last_page=PAGES_TO_SCAN
        )
        text = "".join(
            pytesseract.image_to_string(image, lang="eng") for image in images
        )
    except Exception as e:
        print(f"Error performing OCR on {pdf_path}: {e}")
        return ""
    print(f"OCR Extracted text (first 500 chars): {text[:500]}...")
    return text


def determine_topic(text):
    """Return the folder name of the first keyword found in *text*.

    Matching is a case-insensitive substring search over TOPIC_FOLDERS in
    definition order. Returns None when no keyword occurs (or *text* is
    empty/None).
    """
    print("Analyzing text for topic...")
    haystack = (text or "").lower()  # lower-case once, not once per keyword
    for keyword, folder in TOPIC_FOLDERS.items():
        if keyword.lower() in haystack:
            print(f"Matched keyword: {keyword}")
            return folder
    print("No match found for topic.")
    return None


def auto_rename(filename):
    """Derive ``(title, author, date)`` from a ``Title - Author`` file name.

    The base name (no directory, no extension) is split on "-": the first
    segment is the title, the second the author. Empty or missing segments
    become "Unknown_Title" / "Unknown_Author". The date is always the
    placeholder "2023-01-01".
    """
    base_name = os.path.splitext(os.path.basename(filename))[0]
    parts = [part.strip() for part in base_name.split("-")]
    # str.split never returns an empty list, so test the segment itself.
    title = parts[0] or "Unknown_Title"
    author = parts[1] if len(parts) > 1 and parts[1] else "Unknown_Author"
    date = "2023-01-01"  # Default date if not provided in text
    return title, author, date


def rename_pdf(original_path, title, author, date):
    """Return the path 'title_author_date.pdf' next to *original_path*.

    Spaces are replaced by underscores. Only the path is computed; nothing
    on disk is renamed.
    """
    folder_name = os.path.dirname(original_path)
    new_file_name = f"{title}_{author}_{date}.pdf".replace(" ", "_")
    return os.path.join(folder_name, new_file_name)


def process_pdf(file_path):
    """Classify one PDF and move it to its topic folder or the review folder.

    Raises OSError when a folder cannot be created or the file cannot be
    moved.
    """
    print(f"Processing: {file_path}")

    # Extract text (with OCR fallback)
    extracted_text = extract_text_from_pdf(file_path)
    print(
        "Extracted text for topic matching (first 500 chars): "
        f"{extracted_text[:500]}"
    )

    topic = determine_topic(extracted_text)
    if not topic:
        print(f"No topic determined for {file_path}. Moving to Review Folder.")
        # Created here as well so the function works when called directly.
        os.makedirs(REVIEW_FOLDER, exist_ok=True)
        review_path = unique_path(
            os.path.join(REVIEW_FOLDER, os.path.basename(file_path))
        )
        shutil.move(file_path, review_path)
        print(f"Moved to Review Folder: {review_path}")
        return

    print(f"File matched topic: {topic}")

    # Auto-generate metadata for renaming
    title, author, date = auto_rename(file_path)

    # Rename and move to the appropriate folder
    target_folder = os.path.join(THEOLOGY_RESOURCES_FOLDER, topic)
    os.makedirs(target_folder, exist_ok=True)
    print(f"Moving file to: {target_folder}")

    new_file_path = rename_pdf(file_path, title, author, date)
    # Every file gets the same placeholder date, so generated names collide
    # easily; unique_path keeps an earlier file from being overwritten.
    destination_path = unique_path(
        os.path.join(target_folder, os.path.basename(new_file_path))
    )
    shutil.move(file_path, destination_path)
    print(f"Moved file to: {destination_path}")


def _report_walk_error(error):
    """Print directories os.walk could not list (it skips them silently)."""
    print(f"Cannot read {error.filename}: {error}")


def sort_pdfs():
    """Process every PDF under DOWNLOADS_FOLDER, including subfolders.

    A file that cannot be moved is reported and skipped so the remaining
    files are still processed.
    """
    print("Starting PDF sorting...")
    if not os.path.isdir(DOWNLOADS_FOLDER):
        print(f"Downloads folder not found: {DOWNLOADS_FOLDER}")
        return

    for root, _dirs, files in os.walk(DOWNLOADS_FOLDER, onerror=_report_walk_error):
        for filename in files:
            file_path = os.path.join(root, filename)
            if not is_pdf(filename):
                print(f"Skipping non-PDF file: {file_path}")
                continue
            print(f"Found PDF file: {file_path}")
            try:
                process_pdf(file_path)
            except OSError as e:
                print(f"Could not process {file_path}: {e}")


if __name__ == "__main__":
    # Ensure the review folder exists
    os.makedirs(REVIEW_FOLDER, exist_ok=True)

    # Start processing
    sort_pdfs()
submitted by /u/Busy-Lingonberry3382
[link] [comments]
# r/learnpython
#
# I’m trying to write code that will create a theological database for me.
# I’ll admit, I’m using ChatGPT. Essentially, I want every PDF file
# downloaded to my Downloads folder to be run through OCR and placed in a new
# folder that is nice and organized, with subfolders. The code is below.
# I’m not having any luck. Can anyone help?

"""Sort downloaded PDFs into topic subfolders of a theology library.

Every PDF found under DOWNLOADS_FOLDER has the text of its first pages read
(PyPDF2 first, Tesseract OCR as a fallback), is matched against the keywords
in TOPIC_FOLDERS, renamed to ``title_author_date.pdf`` and moved into the
matching subfolder of THEOLOGY_RESOURCES_FOLDER. PDFs that match no keyword
are moved to REVIEW_FOLDER unchanged.

The script only classifies and files PDFs; it does not write an OCR text
layer back into them.

NOTE: pdf2image needs the Poppler utilities and pytesseract needs the
Tesseract binary to be installed and on PATH.
"""

import os
import shutil

# PyPDF2, pdf2image and pytesseract are imported inside the functions that
# use them, so the keyword-matching and renaming helpers below can be
# imported and tested without the OCR toolchain installed.

# Paths (plain ASCII quotes: the typographic quotes in the pasted version are
# not valid Python string delimiters)
DOWNLOADS_FOLDER = "/Users/guilhermelopes/Downloads"
THEOLOGY_RESOURCES_FOLDER = "/Users/guilhermelopes/Documents/Theology Resources"
REVIEW_FOLDER = "/Users/guilhermelopes/Desktop/PDF Review"

# Number of leading pages read when looking for topic keywords.
PAGES_TO_SCAN = 3

# Keyword searched for in the text -> destination subfolder.
# Keywords are tried in this order and the first hit wins.
TOPIC_FOLDERS = {
    "Old Testament": "Old Testament",
    "New Testament": "New Testament",
    "Papal Documents": "Papal Documents",
    "Franciscan": "Franciscan Charism",
    "Liturgy": "Theology of Liturgy",
    "Ethics": "Ethics and Social Justice",
    "Catholic Social Teaching": "Catholic Social Teaching",
    "Synodality": "Synodality",
    "Spirituality": "Spirituality",
    "Saints": "Lives of the Saints",
    "Apologetics": "Catholic Apologetics",
    "Church History": "Church History",
    "Scripture Study": "Scripture Study",
    "Canon Law": "Canon Law",
    "Mysticism": "Mysticism and Contemplation",
    "Gospel of John": "New Testament",
    "Ignatius Study Bible": "Scripture Study",
}


def is_pdf(filename):
    """Return True when *filename* has a .pdf extension, in any letter case."""
    return filename.lower().endswith(".pdf")


def unique_path(path):
    """Return *path*, or a numbered variant of it that does not exist yet.

    ``doc.pdf`` becomes ``doc_1.pdf``, ``doc_2.pdf``, ... so that moving a
    file to the returned path never overwrites an earlier one.
    """
    if not os.path.exists(path):
        return path
    stem, ext = os.path.splitext(path)
    counter = 1
    while os.path.exists(f"{stem}_{counter}{ext}"):
        counter += 1
    return f"{stem}_{counter}{ext}"


def extract_text_from_pdf(pdf_path):
    """Return the text of the first PAGES_TO_SCAN pages of a PDF.

    The embedded text layer is read with PyPDF2. When that raises or yields
    only whitespace, the pages are OCR'd via ocr_pdf() instead. Returns an
    empty string when neither approach produces text.
    """
    # Outside the try block on purpose: a missing package must raise, not be
    # reported as an unreadable PDF.
    from PyPDF2 import PdfReader

    try:
        reader = PdfReader(pdf_path)
        text = "".join(
            page.extract_text() or "" for page in reader.pages[:PAGES_TO_SCAN]
        )
    except Exception as e:  # any parsing failure just means "try OCR instead"
        print(f"Error reading {pdf_path} with PyPDF2: {e}")
    else:
        if text.strip():
            return text

    # If no text found, fall back to OCR
    print(f"Falling back to OCR for {pdf_path}")
    return ocr_pdf(pdf_path)


def ocr_pdf(pdf_path):
    """Return Tesseract OCR text for the first PAGES_TO_SCAN pages of a PDF.

    Returns an empty string (after printing the error) when rendering or OCR
    fails.
    """
    import pytesseract
    from pdf2image import convert_from_path

    try:
        # first_page/last_page restrict rendering to the pages that are
        # actually read; without them every page is rasterised at 300 dpi
        # before the first one is looked at.
        images = convert_from_path(
            pdf_path, dpi=300, first_page=1, last_page=PAGES_TO_SCAN
        )
        text = "".join(
            pytesseract.image_to_string(image, lang="eng") for image in images
        )
    except Exception as e:
        print(f"Error performing OCR on {pdf_path}: {e}")
        return ""
    print(f"OCR Extracted text (first 500 chars): {text[:500]}...")
    return text


def determine_topic(text):
    """Return the folder name of the first keyword found in *text*.

    Matching is a case-insensitive substring search over TOPIC_FOLDERS in
    definition order. Returns None when no keyword occurs (or *text* is
    empty/None).
    """
    print("Analyzing text for topic...")
    haystack = (text or "").lower()  # lower-case once, not once per keyword
    for keyword, folder in TOPIC_FOLDERS.items():
        if keyword.lower() in haystack:
            print(f"Matched keyword: {keyword}")
            return folder
    print("No match found for topic.")
    return None


def auto_rename(filename):
    """Derive ``(title, author, date)`` from a ``Title - Author`` file name.

    The base name (no directory, no extension) is split on "-": the first
    segment is the title, the second the author. Empty or missing segments
    become "Unknown_Title" / "Unknown_Author". The date is always the
    placeholder "2023-01-01".
    """
    base_name = os.path.splitext(os.path.basename(filename))[0]
    parts = [part.strip() for part in base_name.split("-")]
    # str.split never returns an empty list, so test the segment itself.
    title = parts[0] or "Unknown_Title"
    author = parts[1] if len(parts) > 1 and parts[1] else "Unknown_Author"
    date = "2023-01-01"  # Default date if not provided in text
    return title, author, date


def rename_pdf(original_path, title, author, date):
    """Return the path 'title_author_date.pdf' next to *original_path*.

    Spaces are replaced by underscores. Only the path is computed; nothing
    on disk is renamed.
    """
    folder_name = os.path.dirname(original_path)
    new_file_name = f"{title}_{author}_{date}.pdf".replace(" ", "_")
    return os.path.join(folder_name, new_file_name)


def process_pdf(file_path):
    """Classify one PDF and move it to its topic folder or the review folder.

    Raises OSError when a folder cannot be created or the file cannot be
    moved.
    """
    print(f"Processing: {file_path}")

    # Extract text (with OCR fallback)
    extracted_text = extract_text_from_pdf(file_path)
    print(
        "Extracted text for topic matching (first 500 chars): "
        f"{extracted_text[:500]}"
    )

    topic = determine_topic(extracted_text)
    if not topic:
        print(f"No topic determined for {file_path}. Moving to Review Folder.")
        # Created here as well so the function works when called directly.
        os.makedirs(REVIEW_FOLDER, exist_ok=True)
        review_path = unique_path(
            os.path.join(REVIEW_FOLDER, os.path.basename(file_path))
        )
        shutil.move(file_path, review_path)
        print(f"Moved to Review Folder: {review_path}")
        return

    print(f"File matched topic: {topic}")

    # Auto-generate metadata for renaming
    title, author, date = auto_rename(file_path)

    # Rename and move to the appropriate folder
    target_folder = os.path.join(THEOLOGY_RESOURCES_FOLDER, topic)
    os.makedirs(target_folder, exist_ok=True)
    print(f"Moving file to: {target_folder}")

    new_file_path = rename_pdf(file_path, title, author, date)
    # Every file gets the same placeholder date, so generated names collide
    # easily; unique_path keeps an earlier file from being overwritten.
    destination_path = unique_path(
        os.path.join(target_folder, os.path.basename(new_file_path))
    )
    shutil.move(file_path, destination_path)
    print(f"Moved file to: {destination_path}")


def _report_walk_error(error):
    """Print directories os.walk could not list (it skips them silently)."""
    print(f"Cannot read {error.filename}: {error}")


def sort_pdfs():
    """Process every PDF under DOWNLOADS_FOLDER, including subfolders.

    A file that cannot be moved is reported and skipped so the remaining
    files are still processed.
    """
    print("Starting PDF sorting...")
    if not os.path.isdir(DOWNLOADS_FOLDER):
        print(f"Downloads folder not found: {DOWNLOADS_FOLDER}")
        return

    for root, _dirs, files in os.walk(DOWNLOADS_FOLDER, onerror=_report_walk_error):
        for filename in files:
            file_path = os.path.join(root, filename)
            if not is_pdf(filename):
                print(f"Skipping non-PDF file: {file_path}")
                continue
            print(f"Found PDF file: {file_path}")
            try:
                process_pdf(file_path)
            except OSError as e:
                print(f"Could not process {file_path}: {e}")


if __name__ == "__main__":
    # Ensure the review folder exists
    os.makedirs(REVIEW_FOLDER, exist_ok=True)

    # Start processing
    sort_pdfs()

# submitted by /u/Busy-Lingonberry3382 [link] [comments]
I’m trying to write code that will create a theological database for me. I’ll admit, I’m using ChatGPT.
Essentially, I want every PDF file downloaded to my Downloads folder to be run through OCR and placed in a new folder that is nice and organized, with subfolders. The code is below. I’m not having any luck. Can anyone help?
"""Sort downloaded PDFs into topic subfolders of a theology library.

Every PDF found under DOWNLOADS_FOLDER has the text of its first pages read
(PyPDF2 first, Tesseract OCR as a fallback), is matched against the keywords
in TOPIC_FOLDERS, renamed to ``title_author_date.pdf`` and moved into the
matching subfolder of THEOLOGY_RESOURCES_FOLDER. PDFs that match no keyword
are moved to REVIEW_FOLDER unchanged.

The script only classifies and files PDFs; it does not write an OCR text
layer back into them.

NOTE: pdf2image needs the Poppler utilities and pytesseract needs the
Tesseract binary to be installed and on PATH.
"""

import os
import shutil

# PyPDF2, pdf2image and pytesseract are imported inside the functions that
# use them, so the keyword-matching and renaming helpers below can be
# imported and tested without the OCR toolchain installed.

# Paths
DOWNLOADS_FOLDER = "/Users/guilhermelopes/Downloads"
THEOLOGY_RESOURCES_FOLDER = "/Users/guilhermelopes/Documents/Theology Resources"
REVIEW_FOLDER = "/Users/guilhermelopes/Desktop/PDF Review"

# Number of leading pages read when looking for topic keywords.
PAGES_TO_SCAN = 3

# Keyword searched for in the text -> destination subfolder.
# Keywords are tried in this order and the first hit wins.
TOPIC_FOLDERS = {
    "Old Testament": "Old Testament",
    "New Testament": "New Testament",
    "Papal Documents": "Papal Documents",
    "Franciscan": "Franciscan Charism",
    "Liturgy": "Theology of Liturgy",
    "Ethics": "Ethics and Social Justice",
    "Catholic Social Teaching": "Catholic Social Teaching",
    "Synodality": "Synodality",
    "Spirituality": "Spirituality",
    "Saints": "Lives of the Saints",
    "Apologetics": "Catholic Apologetics",
    "Church History": "Church History",
    "Scripture Study": "Scripture Study",
    "Canon Law": "Canon Law",
    "Mysticism": "Mysticism and Contemplation",
    "Gospel of John": "New Testament",
    "Ignatius Study Bible": "Scripture Study",
}


def is_pdf(filename):
    """Return True when *filename* has a .pdf extension, in any letter case."""
    return filename.lower().endswith(".pdf")


def unique_path(path):
    """Return *path*, or a numbered variant of it that does not exist yet.

    ``doc.pdf`` becomes ``doc_1.pdf``, ``doc_2.pdf``, ... so that moving a
    file to the returned path never overwrites an earlier one.
    """
    if not os.path.exists(path):
        return path
    stem, ext = os.path.splitext(path)
    counter = 1
    while os.path.exists(f"{stem}_{counter}{ext}"):
        counter += 1
    return f"{stem}_{counter}{ext}"


def extract_text_from_pdf(pdf_path):
    """Return the text of the first PAGES_TO_SCAN pages of a PDF.

    The embedded text layer is read with PyPDF2. When that raises or yields
    only whitespace, the pages are OCR'd via ocr_pdf() instead. Returns an
    empty string when neither approach produces text.
    """
    # Outside the try block on purpose: a missing package must raise, not be
    # reported as an unreadable PDF.
    from PyPDF2 import PdfReader

    try:
        reader = PdfReader(pdf_path)
        text = "".join(
            page.extract_text() or "" for page in reader.pages[:PAGES_TO_SCAN]
        )
    except Exception as e:  # any parsing failure just means "try OCR instead"
        print(f"Error reading {pdf_path} with PyPDF2: {e}")
    else:
        if text.strip():
            return text

    # If no text found, fall back to OCR
    print(f"Falling back to OCR for {pdf_path}")
    return ocr_pdf(pdf_path)


def ocr_pdf(pdf_path):
    """Return Tesseract OCR text for the first PAGES_TO_SCAN pages of a PDF.

    Returns an empty string (after printing the error) when rendering or OCR
    fails.
    """
    import pytesseract
    from pdf2image import convert_from_path

    try:
        # first_page/last_page restrict rendering to the pages that are
        # actually read; without them every page is rasterised at 300 dpi
        # before the first one is looked at.
        images = convert_from_path(
            pdf_path, dpi=300, first_page=1, last_page=PAGES_TO_SCAN
        )
        text = "".join(
            pytesseract.image_to_string(image, lang="eng") for image in images
        )
    except Exception as e:
        print(f"Error performing OCR on {pdf_path}: {e}")
        return ""
    print(f"OCR Extracted text (first 500 chars): {text[:500]}...")
    return text


def determine_topic(text):
    """Return the folder name of the first keyword found in *text*.

    Matching is a case-insensitive substring search over TOPIC_FOLDERS in
    definition order. Returns None when no keyword occurs (or *text* is
    empty/None).
    """
    print("Analyzing text for topic...")
    haystack = (text or "").lower()  # lower-case once, not once per keyword
    for keyword, folder in TOPIC_FOLDERS.items():
        if keyword.lower() in haystack:
            print(f"Matched keyword: {keyword}")
            return folder
    print("No match found for topic.")
    return None


def auto_rename(filename):
    """Derive ``(title, author, date)`` from a ``Title - Author`` file name.

    The base name (no directory, no extension) is split on "-": the first
    segment is the title, the second the author. Empty or missing segments
    become "Unknown_Title" / "Unknown_Author". The date is always the
    placeholder "2023-01-01".
    """
    base_name = os.path.splitext(os.path.basename(filename))[0]
    parts = [part.strip() for part in base_name.split("-")]
    # str.split never returns an empty list, so test the segment itself.
    title = parts[0] or "Unknown_Title"
    author = parts[1] if len(parts) > 1 and parts[1] else "Unknown_Author"
    date = "2023-01-01"  # Default date if not provided in text
    return title, author, date


def rename_pdf(original_path, title, author, date):
    """Return the path 'title_author_date.pdf' next to *original_path*.

    Spaces are replaced by underscores. Only the path is computed; nothing
    on disk is renamed.
    """
    folder_name = os.path.dirname(original_path)
    new_file_name = f"{title}_{author}_{date}.pdf".replace(" ", "_")
    return os.path.join(folder_name, new_file_name)


def process_pdf(file_path):
    """Classify one PDF and move it to its topic folder or the review folder.

    Raises OSError when a folder cannot be created or the file cannot be
    moved.
    """
    print(f"Processing: {file_path}")

    # Extract text (with OCR fallback)
    extracted_text = extract_text_from_pdf(file_path)
    print(
        "Extracted text for topic matching (first 500 chars): "
        f"{extracted_text[:500]}"
    )

    topic = determine_topic(extracted_text)
    if not topic:
        print(f"No topic determined for {file_path}. Moving to Review Folder.")
        # Created here as well so the function works when called directly.
        os.makedirs(REVIEW_FOLDER, exist_ok=True)
        review_path = unique_path(
            os.path.join(REVIEW_FOLDER, os.path.basename(file_path))
        )
        shutil.move(file_path, review_path)
        print(f"Moved to Review Folder: {review_path}")
        return

    print(f"File matched topic: {topic}")

    # Auto-generate metadata for renaming
    title, author, date = auto_rename(file_path)

    # Rename and move to the appropriate folder
    target_folder = os.path.join(THEOLOGY_RESOURCES_FOLDER, topic)
    os.makedirs(target_folder, exist_ok=True)
    print(f"Moving file to: {target_folder}")

    new_file_path = rename_pdf(file_path, title, author, date)
    # Every file gets the same placeholder date, so generated names collide
    # easily; unique_path keeps an earlier file from being overwritten.
    destination_path = unique_path(
        os.path.join(target_folder, os.path.basename(new_file_path))
    )
    shutil.move(file_path, destination_path)
    print(f"Moved file to: {destination_path}")


def _report_walk_error(error):
    """Print directories os.walk could not list (it skips them silently)."""
    print(f"Cannot read {error.filename}: {error}")


def sort_pdfs():
    """Process every PDF under DOWNLOADS_FOLDER, including subfolders.

    A file that cannot be moved is reported and skipped so the remaining
    files are still processed.
    """
    print("Starting PDF sorting...")
    if not os.path.isdir(DOWNLOADS_FOLDER):
        print(f"Downloads folder not found: {DOWNLOADS_FOLDER}")
        return

    for root, _dirs, files in os.walk(DOWNLOADS_FOLDER, onerror=_report_walk_error):
        for filename in files:
            file_path = os.path.join(root, filename)
            if not is_pdf(filename):
                print(f"Skipping non-PDF file: {file_path}")
                continue
            print(f"Found PDF file: {file_path}")
            try:
                process_pdf(file_path)
            except OSError as e:
                print(f"Could not process {file_path}: {e}")


if __name__ == "__main__":
    # Ensure the review folder exists
    os.makedirs(REVIEW_FOLDER, exist_ok=True)

    # Start processing
    sort_pdfs()
submitted by /u/Busy-Lingonberry3382
[link] [comments]