# install/import all prerequisites first
# from cgitb import text
from openpyxl import Workbook, load_workbook
from bs4 import BeautifulSoup
# create a question that asks how many files you have
i = 1
n = int(input("How many files ? "))
# final_n = n - 1
# the list of files
files = []
# the list of files only has 1 file contained by default
# while loop will create multiple files in the list so that I don't have to do the tedious work
while i <= n:
files.append("folder/SplashBidNoticeAbstractUI (" + str(i) +").html")
i = i+1
# load an existing Libreoffice Calc file
wb = Workbook()
ws = wb.active
ws.title = "Data"
# add the titles on the first row, each column with the respective title
ws.append(["DatePublished", "Closing Date", "Category", "Procuring Entity", "Approved Budget for the Contract", "Name", "Delivery Period", "Reference Number", "Title", "Area of Delivery", "Solicitation Number", "Contact"])
# the actual magic.
# extract desired data from the html files and then
# paste in the active Libreoffice Calc file
for i in files:
with open(i, "r", errors="ignore") as html_file:
content = html_file.read() # does something
soup = BeautifulSoup(content, "html.parser") # does something
# extracts data from the webpages
if soup.find("span", id="lblDisplayDatePublish") != None:
datePublished = soup.find("span", id="lblDisplayDatePublish").text
else: datePublished = ""
if soup.find("span", id="lblDisplayCloseDateTime") != None:
cd = soup.find("span", id="lblDisplayCloseDateTime").text
else: cd = ""
if soup.find("span", id="lblDisplayCategory") != None:
cat = soup.find("span", id="lblDisplayCategory").text
else: cat = ""
if soup.find("span", id="lblDisplayProcuringEntity") != None:
pro_id = soup.find("span", id="lblDisplayProcuringEntity").text.replace("", "")
else: pro_id = ""
if soup.find("span", id="lblDisplayBudget") != None:
abc = soup.find("span", id="lblDisplayBudget").text
else: abc = ""
if soup.find("span", id="lblHeader") != None:
name = soup.find("span", id="lblHeader").text.replace(" ", "_").replace("\n", "_")
else: name = ""
if soup.find("span", id="lblDisplayPeriod") != None:
delp = soup.find("span", id="lblDisplayPeriod").text
else: delp = ""
if soup.find("span", id="lblDisplayReferenceNo")!= None:
ref_num = soup.find("span", id="lblDisplayReferenceNo").text
else: ref_num = ""
if soup.find("span", id="lblDisplayTitle")!= None:
title = soup.find("span", id="lblDisplayTitle").text.replace(" ", "_").replace("\n", "_")
else: title = ""
if soup.find("span", id="lblDisplayAOD") != None:
aod = soup.find("span", id="lblDisplayAOD").text.replace("\n", "_")
else: aod = ""
if soup.find("span", id="lblDisplaySolNumber") != None:
solNr = soup.find("span", id="lblDisplaySolNumber").text
else: solNr = ""
if soup.find("span", id="lblDisplayContactPerson")!= None:
contact = soup.find("span", id="lblDisplayContactPerson").text
else: contact = ""
# just an assurance that the code worked and nothing screwed up
print("\nBid" + i)
print("Date Published: " + datePublished)
print("Closing Date: " + cd)
print("Category : " + cat)
print("Procurement Entity : " + pro_id)
print("Name: " + name)
print("Delivery Period: " + delp)
print("ABC: " + abc)
print("Reference Number : " + ref_num)
print("Title : " + title)
print("Area of Delivery : " + aod)
print("Solicitation Number: "+ solNr)
print("Contact: "+ contact)
# pastes the data inside the calc file under the titles
ws.append([datePublished, cd, cat, pro_id, abc, name, delp, ref_num, title, aod, solNr, contact])
# saves file so work is safe and sound
filename = input("filename: ")
wb.save(filename + ".xlsx")