# ****************************************************************************** # FILENAME sgwire_isringhausen.py # # Version 7.0 # # Encoding UTF-8 # # DESCRIPTION Extract scheduled data from pdf:s to one calc (xlsx format) # # NOTES Menu language - Swedish # # Compiler python 3.10.12 running @ Ubuntu 22.04 LTS # # Copyright © 2023 All rights reserved Leif Krüger # #----------------------------------------------------------------------------- # CHANGES #----------------------------------------------------------------------------- # REF NO VERSION DATE WHO DETAIL # ----------------------------------------------------------------------------- # 1 1 2023-04-22 LK Start date # 2 2 2023-04-23 LK Extract data # 3 3 2023-04-23 LK Save to xlsx # 4 4 2023-04-23 LK Read all pdf:s in a subfolder "pdf" # 5 5 2023-04-25 LK Include T-items (Tage) as well # 6 6 2023-04-27 LK Add alternating row color in xlsx # 7 6.1 2023-04-29 LK A script for both linux/windows # 8 6.2 2023-04-30 LK The same order has the same bg-color # 9 6.3 2023-05-01 LK Weekorder rows more compact in code # 10 6.4 2023-05-01 LK Handle T and W in same code # 11 6.5 2023-05-07 LK Move settings to sgwire_config.cfg # 12 6.6 2023-06-25 LK Make more clean code + result info # 13 6.7 2023-08-01 LK Bug fix: Add missing "Tage" starts with T3x # 14 6.7 2023-08-01 LK Remove obsolete text see "material_text21" # 15 7.0 2023-08-02 LK Autofit the worksheet (column width) # ****************************************************************************** import PyPDF2 import xlsxwriter import os import glob import configparser ################################################################################ # Read config from file sgwire_config.cfg sgwire_config = configparser.RawConfigParser() sgwire_config.read("./sgwire_config.cfg") # Parameters workbook_name = sgwire_config.get("sgwire-config", "workbook_name") worksheet_name = sgwire_config.get("sgwire-config", "worksheet_name") output_filename = workbook_name + ".xlsx" output_file_path_windows = sgwire_config.get("sgwire-config", "output_file_path_windows") output_file_path_linux = sgwire_config.get("sgwire-config", "output_file_path_linux") path_pdf_windows = sgwire_config.get("sgwire-config", "path_pdf_windows") path_pdf_linux = sgwire_config.get("sgwire-config", "path_pdf_linux") output_striped = sgwire_config.get("sgwire-config", "output_striped") striped_by = sgwire_config.get("sgwire-config", "striped_by") cell_bg_header = sgwire_config.get("sgwire-config", "cell_bg_header") cell_bg_row_even = sgwire_config.get("sgwire-config", "cell_bg_row_even") cell_bg_row_odd = sgwire_config.get("sgwire-config", "cell_bg_row_odd") info_text_started = sgwire_config.get("sgwire-config", "info_text_started") info_text_finished = sgwire_config.get("sgwire-config", "info_text_finished") # Script data script_name = "sgwire_isringhausen.py" copyright_year = "2023" creator = "Leif Krüger" creator_email = "leif@leifkruger.se" script_version = "7.0" last_updated = "2023-08-02" # Clear screen def clearConsole(): command = "clear" if os.name in ("nt", "dos"): # If Machine is running on Windows, use cls command = "cls" os.system(command) clearConsole() # Take care of path depending on os, Linux or Windows if os.name in ("nt", "dos"): path_pdf = path_pdf_windows path_output = output_file_path_windows + output_filename else: path_pdf = path_pdf_linux path_output = output_file_path_linux + output_filename # Show start message print(f"\n\n\n\n\t\tScriptnamn: {script_name}") print(f"\n\n\n\n\t\tStatus: {info_text_started}") # Create a workbook and add a worksheet workbook = xlsxwriter.Workbook(path_output) worksheet = workbook.add_worksheet(worksheet_name) # Start from the first cell. Rows and columns are zero indexed row = 0 col = 0 # If output striped: Cell background color if output_striped == "yes": bg_format_header = workbook.add_format({'bg_color': cell_bg_header}) bg_format1 = workbook.add_format({'bg_color': cell_bg_row_odd}) bg_format2 = workbook.add_format({'bg_color': cell_bg_row_even}) else: # bg_format_header = workbook.add_format({'bg_color': cell_bg_header}) bg_format1 = workbook.add_format({'bg_color': '#FFFFFA'}) bg_format2 = workbook.add_format({'bg_color': '#FFFFFF'}) # Headers: attribut and backgroundcolor worksheet.write(row, col, "Purchase Order No", bg_format_header) worksheet.write(row, col + 1, "Customer Part No", bg_format_header) worksheet.write(row, col + 2, "Customer Part Description", bg_format_header) worksheet.write(row, col + 3, "Part Description En", bg_format_header) worksheet.write(row, col + 4, "Part Description En2", bg_format_header) worksheet.write(row, col + 5, "Day / Week", bg_format_header) worksheet.write(row, col + 6, "Open QTY", bg_format_header) worksheet.write(row, col + 7, "Change", bg_format_header) # New row after headers in excel file row += 1 # A list of all pdf files pdf_files = glob.glob(path_pdf) # Number of pdf files numberof_pdf_files = len(pdf_files) # Read all pdf files for x_pdf in range(numberof_pdf_files): # Open PDF file in read mode pdf_file = open(pdf_files[x_pdf], 'rb') # A object of PyPDF2 for the PDF file pdf_reader = PyPDF2.PdfReader(pdf_file) # Variable for extracted text from pdf pdf_text = "" # Loop through every pdf page and extract text for page_num in range(len(pdf_reader.pages)): page = pdf_reader.pages[page_num] pdf_text += page.extract_text() # Close pdf file pdf_file.close() # Split a string into a list where each row is a list item text = pdf_text.split('\n') # Number of rows numberOfRows = (len(text)) # Use a flag to avoid dublicates duplicate_flag = 0 duplicate_flag2 = 0 duplicate_flag3 = 0 for x in range(numberOfRows): textrow = text[x] # Cell background color for each order/row by pdf if striped_by == "order": if x_pdf % 2 == 0: bgcell = bg_format1 # Odd else: bgcell = bg_format2 # Even elif striped_by == "row": if row % 2 == 0: bgcell = bg_format1 # Odd else: bgcell = bg_format2 # Even else: bgcell = bg_format1 if "Lieferplannummer/Datum" in textrow: if duplicate_flag == 0: y = x + 1 lieferplannummer_date = text[y] lieferplannummer = lieferplannummer_date[0:11] duplicate_flag = 1 if "Material " in textrow: material = text[x].replace("Material ", "") meterial_no = material[0:14] meterial_description = material[14:64] material_text20 = text[20] material_text21 = text[21].replace("______________________", "") material_text21 = material_text21.replace("ISRINGHAUSEN GmbH & Co. KG , 32655 Lemgo", "") # Take care of "T DD.MM.YY" after text "Liefertermin" if "Liefertermin" in textrow: duplicate_flag2 = 1 # Take care of "T DD.MM.YY" before text "Die bestellen Artikel" if "Die bestellen Artikel" in textrow: duplicate_flag3 = 1 # Take care of orderrow items "T" (Tage) and items "W" (Woche) if ((("T0" in textrow or "T1" in textrow or "T2" in textrow or "T3" in textrow) and (duplicate_flag2 == 1 and duplicate_flag3 == 0)) or ("W0" in textrow or "W1" in textrow or "W2" in textrow or "W3" in textrow or "W4" in textrow or "W5" in textrow)): tw_textrow = text[x] worksheet.write(row, col, lieferplannummer, bgcell) worksheet.write(row, col + 1, meterial_no, bgcell) worksheet.write(row, col + 2, meterial_description, bgcell) worksheet.write(row, col + 3, material_text20, bgcell) worksheet.write(row, col + 4, material_text21, bgcell) worksheet.write(row, col + 5, tw_textrow[0:12], bgcell) worksheet.write(row, col + 6, tw_textrow[23:35], bgcell) worksheet.write(row, col + 7, tw_textrow[41:52], bgcell) row += 1 # Autofit the worksheet worksheet.autofit() workbook.close() clearConsole() print(f"\n\n\n\n\t\tStatus: {info_text_finished}") print(f"\n\t\tStatistik: {numberof_pdf_files} st pdf:er genomsökta") print(f"\n\n\n\t\tSe resultat i fil: \n\n\t\t{path_output}") print("\n\n\n\n\n\t\t--- About the script -------------------------------------") print(f"\n\t\tScriptname: {script_name} \n\n\t\tScriptversion: {script_version}") print(f"\n\t\tLast updated: {last_updated}") print(f"\n\t\tCreator: {creator}") print(f"\n\t\tContact email: {creator_email}") print(f"\n\n\t\tCopyright © {copyright_year} All Rights Reserved {creator}\n\n\n")