File Renaming, Tesseract-OCR File formats PDF, JPG, TIF. Can’t get Tesseract to work /u/No-Morning2465 Python Education

Good Morning, community,

I’ve been working on a solution to rename all of my pdf files with a date format YYYY-MM-DD, so far I’ve managed to rename about 750 documents, I still have a large amount of pdf files where there’s a date in the ocr text, but for some reason I’m unable to pick them out. I’m now trying to go one stop further and get the program Tesseract-OCR to work on pdf, .jpg and tif files.

PyCharm is saying that I have all of the packages installed. I’ve also added the C:Program FilesTesseract-OCR to system path variables.

When I open a terminal window to run tesseract --version I’m getting a error message “tesseract : The term ‘tesseract’ is not recognized as the name of a cmdlet, function, script file, or operable program. Check the spelling of the name, or if a path was included, verify that the path is correct and try again. At line:1 char:1 + tesseract –version + ~~~~~~~~~ + CategoryInfo : ObjectNotFound: (tesseract:String) [], CommandNotFoundException + FullyQualifiedErrorId : CommandNotFoundException”

I know my code will not be perfect, I’ve only being playing around with Python for a couple of months.

Hopefully I’ve posted enough information and in the correct format and that someone within the community can advise where I’m going wrong. I have attached a copy of my code for reference.

Look forward to hearing from you soon.

import pdfplumber import re import os from datetime import datetime from PIL import Image import pytesseract import logging # Set up logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') def extract_date_from_pdf(pdf_path): date_pattern = re.compile( r'(d{4}[-/]d{2}[-/]d{2})|' # YYYY-MM-DD or YYYY/MM/DD r'(d{2}[-/]d{2}[-/]d{4})|' # MM-DD-YYYY or MM/DD/YYYY r'(d{1,2} w+ d{4})|' # 1st January 2024, 01 January 2024 r'(d{1,2} w+ d{2})|' # 13 June 22 r'(d{2}-d{2}-d{2})|' # 26-11-24 r'(d{2}-d{2}-d{4})|' # 26-11-2024 r'(w+ d{4})|' # June 2024 r'(d{2} w{3} d{4})|' # 26 Nov 2024 r'(d{2}-w{3}-d{4})|' # 26-Nov-2024 r'(d{2} w{3} d{4} to d{2} w{3} d{4})|' # 15 Oct 2020 to 14 Oct 2021 r'(d{2} w{3} - d{2} w{3} d{4})|' # 22 Aug - 21 Sep 2023 r'(Date: d{2}/d{2}/d{2})|' # Date: 17/02/17 r'(d{2}/d{2}/d{2})|' # 17/02/17 r'(d{2}/d{2}/d{4})' # 17/02/2017 ) date = None try: with pdfplumber.open(pdf_path) as pdf: for page in pdf.pages: text = page.extract_text() match = date_pattern.search(text) if match: date = match.group() break except Exception as e: logging.error(f"Error opening {pdf_path}: {e}") return date def extract_date_from_image(image_path): date_pattern = re.compile( r'(d{4}[-/]d{2}[-/]d{2})|' # YYYY-MM-DD or YYYY/MM/DD r'(d{2}[-/]d{2}[-/]d{4})|' # MM-DD-YYYY or MM/DD/YYYY r'(d{1,2} w+ d{4})|' # 1st January 2024, 01 January 2024 r'(d{1,2} w+ d{2})|' # 13 June 22 r'(d{2}-d{2}-d{2})|' # 26-11-24 r'(d{2}-d{2}-d{4})|' # 26-11-2024 r'(w+ d{4})|' # June 2024 r'(d{2} w{3} d{4})|' # 26 Nov 2024 r'(d{2}-w{3}-d{4})|' # 26-Nov-2024 r'(d{2} w{3} d{4} to d{2} w{3} d{4})|' # 15 Oct 2020 to 14 Oct 2021 r'(d{2} w{3} - d{2} w{3} d{4})|' # 22 Aug - 21 Sep 2023 r'(Date: d{2}/d{2}/d{2})|' # Date: 17/02/17 r'(d{2}/d{2}/d{2})|' # 17/02/17 r'(d{2}/d{2}/d{4})' # 17/02/2017 ) date = None try: image = Image.open(image_path) text = pytesseract.image_to_string(image) match = date_pattern.search(text) if match: date = match.group() except Exception as e: logging.error(f"Error opening {image_path}: {e}") return date def normalize_date(date_str): try: if " to " in date_str: start_date_str, end_date_str = date_str.split(" to ") start_date = normalize_date(start_date_str.strip()) end_date = normalize_date(end_date_str.strip()) return f"{start_date}_to_{end_date}" elif " - " in date_str: start_date_str, end_date_str, year_str = date_str.split(" ")[0], date_str.split(" ")[2], date_str.split(" ")[-1] start_date = normalize_date(f"{start_date_str} {year_str}") end_date = normalize_date(f"{end_date_str} {year_str}") return f"{start_date}_to_{end_date}" elif "Date: " in date_str: date_str = date_str.replace("Date: ", "") for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%m-%d-%Y", "%m/%d/%Y", "%d-%m-%Y", "%d/%m/%Y", "%d %B %Y", "%d %b %y", "%d-%m-%y", "%B %Y", "%d %b %Y", "%d-%b-%Y", "%d/%m/%y", "%Y"): try: date_obj = datetime.strptime(date_str, fmt) if fmt == "%B %Y": return date_obj.strftime("%Y-%m") + "-01" elif fmt == "%Y": return date_obj.strftime("%Y") return date_obj.strftime("%Y-%m-%d") except ValueError: continue raise ValueError(f"Date format not recognized: {date_str}") except Exception as e: logging.error(f"Error normalizing date: {e}") return None def rename_files(directory): for root, _, files in os.walk(directory): for filename in files: if filename.endswith((".pdf", ".jpg", ".tif")): if re.match(r'd{4}-d{2}-d{2}', filename): continue file_path = os.path.join(root, filename) date = None if filename.endswith(".pdf"): date = extract_date_from_pdf(file_path) elif filename.endswith((".jpg", ".jpeg", ".tif", ".tiff")): date = extract_date_from_image(file_path) if date: normalized_date = normalize_date(date) if normalized_date: new_filename = f"{normalized_date}_{filename}" new_file_path = os.path.join(root, new_filename) try: os.rename(file_path, new_file_path) logging.info(f"Renamed {filename} to {new_filename}") except Exception as e: logging.error(f"Error renaming {filename}: {e}") else: logging.warning(f"Could not normalize date found in {filename}") else: logging.warning(f"Date not found in {filename}") if __name__ == "__main__": directory = "F:/Documents/Scanning/AA Master Cabinet/Bills - Gas" rename_files(directory) logging.info("Done!") 

2024-12-19 09:00:09,837 – WARNING – Date not found in Scan2009-01-17 1943.tif

2024-12-19 09:00:09,995 – ERROR – Error normalizing date: Date format not recognized: number 0415

2024-12-19 09:00:09,995 – WARNING – Could not normalize date found in Scan2009-01-17 19430001.pdf

2024-12-19 09:00:10,042 – ERROR – Error opening F:/Documents/Scanning/AA Master Filing Cabinets Scanned/Bills – GasScan2009-01-17 19430001.tif: tesseract is not installed or it’s not in your PATH. See README file for more information.

2024-12-19 09:00:10,345 – INFO – Done!

Process finished with exit code 0

submitted by /u/No-Morning2465
[link] [comments]

​r/learnpython Good Morning, community, I’ve been working on a solution to rename all of my pdf files with a date format YYYY-MM-DD, so far I’ve managed to rename about 750 documents, I still have a large amount of pdf files where there’s a date in the ocr text, but for some reason I’m unable to pick them out. I’m now trying to go one stop further and get the program Tesseract-OCR to work on pdf, .jpg and tif files. PyCharm is saying that I have all of the packages installed. I’ve also added the C:Program FilesTesseract-OCR to system path variables. When I open a terminal window to run tesseract –version I’m getting a error message “tesseract : The term ‘tesseract’ is not recognized as the name of a cmdlet, function, script file, or operable program. Check the spelling of the name, or if a path was included, verify that the path is correct and try again. At line:1 char:1 + tesseract –version + ~~~~~~~~~ + CategoryInfo : ObjectNotFound: (tesseract:String) [], CommandNotFoundException + FullyQualifiedErrorId : CommandNotFoundException” I know my code will not be perfect, I’ve only being playing around with Python for a couple of months. Hopefully I’ve posted enough information and in the correct format and that someone within the community can advise where I’m going wrong. I have attached a copy of my code for reference. Look forward to hearing from you soon. import pdfplumber import re import os from datetime import datetime from PIL import Image import pytesseract import logging # Set up logging logging.basicConfig(level=logging.INFO, format=’%(asctime)s – %(levelname)s – %(message)s’) def extract_date_from_pdf(pdf_path): date_pattern = re.compile( r'(d{4}[-/]d{2}[-/]d{2})|’ # YYYY-MM-DD or YYYY/MM/DD r'(d{2}[-/]d{2}[-/]d{4})|’ # MM-DD-YYYY or MM/DD/YYYY r'(d{1,2} w+ d{4})|’ # 1st January 2024, 01 January 2024 r'(d{1,2} w+ d{2})|’ # 13 June 22 r'(d{2}-d{2}-d{2})|’ # 26-11-24 r'(d{2}-d{2}-d{4})|’ # 26-11-2024 r'(w+ d{4})|’ # June 2024 r'(d{2} w{3} d{4})|’ # 26 Nov 2024 r'(d{2}-w{3}-d{4})|’ # 26-Nov-2024 r'(d{2} w{3} d{4} to d{2} w{3} d{4})|’ # 15 Oct 2020 to 14 Oct 2021 r'(d{2} w{3} – d{2} w{3} d{4})|’ # 22 Aug – 21 Sep 2023 r'(Date: d{2}/d{2}/d{2})|’ # Date: 17/02/17 r'(d{2}/d{2}/d{2})|’ # 17/02/17 r'(d{2}/d{2}/d{4})’ # 17/02/2017 ) date = None try: with pdfplumber.open(pdf_path) as pdf: for page in pdf.pages: text = page.extract_text() match = date_pattern.search(text) if match: date = match.group() break except Exception as e: logging.error(f”Error opening {pdf_path}: {e}”) return date def extract_date_from_image(image_path): date_pattern = re.compile( r'(d{4}[-/]d{2}[-/]d{2})|’ # YYYY-MM-DD or YYYY/MM/DD r'(d{2}[-/]d{2}[-/]d{4})|’ # MM-DD-YYYY or MM/DD/YYYY r'(d{1,2} w+ d{4})|’ # 1st January 2024, 01 January 2024 r'(d{1,2} w+ d{2})|’ # 13 June 22 r'(d{2}-d{2}-d{2})|’ # 26-11-24 r'(d{2}-d{2}-d{4})|’ # 26-11-2024 r'(w+ d{4})|’ # June 2024 r'(d{2} w{3} d{4})|’ # 26 Nov 2024 r'(d{2}-w{3}-d{4})|’ # 26-Nov-2024 r'(d{2} w{3} d{4} to d{2} w{3} d{4})|’ # 15 Oct 2020 to 14 Oct 2021 r'(d{2} w{3} – d{2} w{3} d{4})|’ # 22 Aug – 21 Sep 2023 r'(Date: d{2}/d{2}/d{2})|’ # Date: 17/02/17 r'(d{2}/d{2}/d{2})|’ # 17/02/17 r'(d{2}/d{2}/d{4})’ # 17/02/2017 ) date = None try: image = Image.open(image_path) text = pytesseract.image_to_string(image) match = date_pattern.search(text) if match: date = match.group() except Exception as e: logging.error(f”Error opening {image_path}: {e}”) return date def normalize_date(date_str): try: if ” to ” in date_str: start_date_str, end_date_str = date_str.split(” to “) start_date = normalize_date(start_date_str.strip()) end_date = normalize_date(end_date_str.strip()) return f”{start_date}_to_{end_date}” elif ” – ” in date_str: start_date_str, end_date_str, year_str = date_str.split(” “)[0], date_str.split(” “)[2], date_str.split(” “)[-1] start_date = normalize_date(f”{start_date_str} {year_str}”) end_date = normalize_date(f”{end_date_str} {year_str}”) return f”{start_date}_to_{end_date}” elif “Date: ” in date_str: date_str = date_str.replace(“Date: “, “”) for fmt in (“%Y-%m-%d”, “%Y/%m/%d”, “%m-%d-%Y”, “%m/%d/%Y”, “%d-%m-%Y”, “%d/%m/%Y”, “%d %B %Y”, “%d %b %y”, “%d-%m-%y”, “%B %Y”, “%d %b %Y”, “%d-%b-%Y”, “%d/%m/%y”, “%Y”): try: date_obj = datetime.strptime(date_str, fmt) if fmt == “%B %Y”: return date_obj.strftime(“%Y-%m”) + “-01” elif fmt == “%Y”: return date_obj.strftime(“%Y”) return date_obj.strftime(“%Y-%m-%d”) except ValueError: continue raise ValueError(f”Date format not recognized: {date_str}”) except Exception as e: logging.error(f”Error normalizing date: {e}”) return None def rename_files(directory): for root, _, files in os.walk(directory): for filename in files: if filename.endswith((“.pdf”, “.jpg”, “.tif”)): if re.match(r’d{4}-d{2}-d{2}’, filename): continue file_path = os.path.join(root, filename) date = None if filename.endswith(“.pdf”): date = extract_date_from_pdf(file_path) elif filename.endswith((“.jpg”, “.jpeg”, “.tif”, “.tiff”)): date = extract_date_from_image(file_path) if date: normalized_date = normalize_date(date) if normalized_date: new_filename = f”{normalized_date}_{filename}” new_file_path = os.path.join(root, new_filename) try: os.rename(file_path, new_file_path) logging.info(f”Renamed {filename} to {new_filename}”) except Exception as e: logging.error(f”Error renaming {filename}: {e}”) else: logging.warning(f”Could not normalize date found in {filename}”) else: logging.warning(f”Date not found in {filename}”) if __name__ == “__main__”: directory = “F:/Documents/Scanning/AA Master Cabinet/Bills – Gas” rename_files(directory) logging.info(“Done!”) 2024-12-19 09:00:09,837 – WARNING – Date not found in Scan2009-01-17 1943.tif 2024-12-19 09:00:09,995 – ERROR – Error normalizing date: Date format not recognized: number 0415 2024-12-19 09:00:09,995 – WARNING – Could not normalize date found in Scan2009-01-17 19430001.pdf 2024-12-19 09:00:10,042 – ERROR – Error opening F:/Documents/Scanning/AA Master Filing Cabinets Scanned/Bills – GasScan2009-01-17 19430001.tif: tesseract is not installed or it’s not in your PATH. See README file for more information. 2024-12-19 09:00:10,345 – INFO – Done! Process finished with exit code 0 submitted by /u/No-Morning2465 [link] [comments] 

Good Morning, community,

I’ve been working on a solution to rename all of my pdf files with a date format YYYY-MM-DD, so far I’ve managed to rename about 750 documents, I still have a large amount of pdf files where there’s a date in the ocr text, but for some reason I’m unable to pick them out. I’m now trying to go one stop further and get the program Tesseract-OCR to work on pdf, .jpg and tif files.

PyCharm is saying that I have all of the packages installed. I’ve also added the C:Program FilesTesseract-OCR to system path variables.

When I open a terminal window to run tesseract --version I’m getting a error message “tesseract : The term ‘tesseract’ is not recognized as the name of a cmdlet, function, script file, or operable program. Check the spelling of the name, or if a path was included, verify that the path is correct and try again. At line:1 char:1 + tesseract –version + ~~~~~~~~~ + CategoryInfo : ObjectNotFound: (tesseract:String) [], CommandNotFoundException + FullyQualifiedErrorId : CommandNotFoundException”

I know my code will not be perfect, I’ve only being playing around with Python for a couple of months.

Hopefully I’ve posted enough information and in the correct format and that someone within the community can advise where I’m going wrong. I have attached a copy of my code for reference.

Look forward to hearing from you soon.

import pdfplumber import re import os from datetime import datetime from PIL import Image import pytesseract import logging # Set up logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') def extract_date_from_pdf(pdf_path): date_pattern = re.compile( r'(d{4}[-/]d{2}[-/]d{2})|' # YYYY-MM-DD or YYYY/MM/DD r'(d{2}[-/]d{2}[-/]d{4})|' # MM-DD-YYYY or MM/DD/YYYY r'(d{1,2} w+ d{4})|' # 1st January 2024, 01 January 2024 r'(d{1,2} w+ d{2})|' # 13 June 22 r'(d{2}-d{2}-d{2})|' # 26-11-24 r'(d{2}-d{2}-d{4})|' # 26-11-2024 r'(w+ d{4})|' # June 2024 r'(d{2} w{3} d{4})|' # 26 Nov 2024 r'(d{2}-w{3}-d{4})|' # 26-Nov-2024 r'(d{2} w{3} d{4} to d{2} w{3} d{4})|' # 15 Oct 2020 to 14 Oct 2021 r'(d{2} w{3} - d{2} w{3} d{4})|' # 22 Aug - 21 Sep 2023 r'(Date: d{2}/d{2}/d{2})|' # Date: 17/02/17 r'(d{2}/d{2}/d{2})|' # 17/02/17 r'(d{2}/d{2}/d{4})' # 17/02/2017 ) date = None try: with pdfplumber.open(pdf_path) as pdf: for page in pdf.pages: text = page.extract_text() match = date_pattern.search(text) if match: date = match.group() break except Exception as e: logging.error(f"Error opening {pdf_path}: {e}") return date def extract_date_from_image(image_path): date_pattern = re.compile( r'(d{4}[-/]d{2}[-/]d{2})|' # YYYY-MM-DD or YYYY/MM/DD r'(d{2}[-/]d{2}[-/]d{4})|' # MM-DD-YYYY or MM/DD/YYYY r'(d{1,2} w+ d{4})|' # 1st January 2024, 01 January 2024 r'(d{1,2} w+ d{2})|' # 13 June 22 r'(d{2}-d{2}-d{2})|' # 26-11-24 r'(d{2}-d{2}-d{4})|' # 26-11-2024 r'(w+ d{4})|' # June 2024 r'(d{2} w{3} d{4})|' # 26 Nov 2024 r'(d{2}-w{3}-d{4})|' # 26-Nov-2024 r'(d{2} w{3} d{4} to d{2} w{3} d{4})|' # 15 Oct 2020 to 14 Oct 2021 r'(d{2} w{3} - d{2} w{3} d{4})|' # 22 Aug - 21 Sep 2023 r'(Date: d{2}/d{2}/d{2})|' # Date: 17/02/17 r'(d{2}/d{2}/d{2})|' # 17/02/17 r'(d{2}/d{2}/d{4})' # 17/02/2017 ) date = None try: image = Image.open(image_path) text = pytesseract.image_to_string(image) match = date_pattern.search(text) if match: date = match.group() except Exception as e: logging.error(f"Error opening {image_path}: {e}") return date def normalize_date(date_str): try: if " to " in date_str: start_date_str, end_date_str = date_str.split(" to ") start_date = normalize_date(start_date_str.strip()) end_date = normalize_date(end_date_str.strip()) return f"{start_date}_to_{end_date}" elif " - " in date_str: start_date_str, end_date_str, year_str = date_str.split(" ")[0], date_str.split(" ")[2], date_str.split(" ")[-1] start_date = normalize_date(f"{start_date_str} {year_str}") end_date = normalize_date(f"{end_date_str} {year_str}") return f"{start_date}_to_{end_date}" elif "Date: " in date_str: date_str = date_str.replace("Date: ", "") for fmt in ("%Y-%m-%d", "%Y/%m/%d", "%m-%d-%Y", "%m/%d/%Y", "%d-%m-%Y", "%d/%m/%Y", "%d %B %Y", "%d %b %y", "%d-%m-%y", "%B %Y", "%d %b %Y", "%d-%b-%Y", "%d/%m/%y", "%Y"): try: date_obj = datetime.strptime(date_str, fmt) if fmt == "%B %Y": return date_obj.strftime("%Y-%m") + "-01" elif fmt == "%Y": return date_obj.strftime("%Y") return date_obj.strftime("%Y-%m-%d") except ValueError: continue raise ValueError(f"Date format not recognized: {date_str}") except Exception as e: logging.error(f"Error normalizing date: {e}") return None def rename_files(directory): for root, _, files in os.walk(directory): for filename in files: if filename.endswith((".pdf", ".jpg", ".tif")): if re.match(r'd{4}-d{2}-d{2}', filename): continue file_path = os.path.join(root, filename) date = None if filename.endswith(".pdf"): date = extract_date_from_pdf(file_path) elif filename.endswith((".jpg", ".jpeg", ".tif", ".tiff")): date = extract_date_from_image(file_path) if date: normalized_date = normalize_date(date) if normalized_date: new_filename = f"{normalized_date}_{filename}" new_file_path = os.path.join(root, new_filename) try: os.rename(file_path, new_file_path) logging.info(f"Renamed {filename} to {new_filename}") except Exception as e: logging.error(f"Error renaming {filename}: {e}") else: logging.warning(f"Could not normalize date found in {filename}") else: logging.warning(f"Date not found in {filename}") if __name__ == "__main__": directory = "F:/Documents/Scanning/AA Master Cabinet/Bills - Gas" rename_files(directory) logging.info("Done!") 

2024-12-19 09:00:09,837 – WARNING – Date not found in Scan2009-01-17 1943.tif

2024-12-19 09:00:09,995 – ERROR – Error normalizing date: Date format not recognized: number 0415

2024-12-19 09:00:09,995 – WARNING – Could not normalize date found in Scan2009-01-17 19430001.pdf

2024-12-19 09:00:10,042 – ERROR – Error opening F:/Documents/Scanning/AA Master Filing Cabinets Scanned/Bills – GasScan2009-01-17 19430001.tif: tesseract is not installed or it’s not in your PATH. See README file for more information.

2024-12-19 09:00:10,345 – INFO – Done!

Process finished with exit code 0

submitted by /u/No-Morning2465
[link] [comments] 

Leave a Reply

Your email address will not be published. Required fields are marked *