美丽汤错误："openpyxl.utils.exceptions.illegalcharactererror“开发者社区

相关文章推荐

高兴的硬币 · mysql 锁 - dogRuning - 博客园· 1 月前 ·

不羁的四季豆 · python查询数据库某一列保存为list_ ...· 1 年前 ·

活泼的针织衫 · tsconfig.json中声明的角库路径别 ...· 1 年前 ·

谈吐大方的铁链 · docker oracle io ...· 1 年前 ·

强悍的牙膏 · grep 指定行范围-掘金· 1 年前 ·

腾讯云

开发者社区

文档意见反馈控制台

首页 TVP

文章/答案/技术大牛

发布

# install/import all prerequisites first
# from cgitb import text
from openpyxl import Workbook, load_workbook
from bs4 import BeautifulSoup
# create a question that asks how many files you have
i = 1
n = int(input("How many files ? "))
# final_n = n - 1
# the list of files
files = []
# the list of files only has 1 file contained by default
# while loop will create multiple files in the list so that I don't have to do the tedious work
while i <= n:
    files.append("folder/SplashBidNoticeAbstractUI (" + str(i) +").html")
    i = i+1
# load an existing Libreoffice Calc file
wb = Workbook()
ws = wb.active
ws.title = "Data"
# add the titles on the first row, each column with the respective title
ws.append(["DatePublished", "Closing Date", "Category", "Procuring Entity", "Approved Budget for the Contract", "Name", "Delivery Period", "Reference Number", "Title", "Area of Delivery", "Solicitation Number", "Contact"])
# the actual magic.
# extract desired data from the html files and then
# paste in the active Libreoffice Calc file
for i in files:
    with open(i, "r", errors="ignore") as html_file:
        content = html_file.read()  # does something
        soup = BeautifulSoup(content, "html.parser")  # does something
        # extracts data from the webpages
        if soup.find("span", id="lblDisplayDatePublish") != None:
            datePublished = soup.find("span", id="lblDisplayDatePublish").text
        else: datePublished = ""
        if soup.find("span", id="lblDisplayCloseDateTime") != None:
            cd  = soup.find("span", id="lblDisplayCloseDateTime").text
        else: cd = ""
        if soup.find("span", id="lblDisplayCategory") != None:
            cat = soup.find("span", id="lblDisplayCategory").text
        else: cat = ""
        if soup.find("span", id="lblDisplayProcuringEntity") != None:
            pro_id = soup.find("span", id="lblDisplayProcuringEntity").text.replace("", "")
        else: pro_id = ""
        if soup.find("span", id="lblDisplayBudget") != None:
            abc = soup.find("span", id="lblDisplayBudget").text
        else: abc = ""
        if soup.find("span", id="lblHeader") != None:
            name = soup.find("span", id="lblHeader").text.replace(" ", "_").replace("\n", "_")
        else: name = ""
        if soup.find("span", id="lblDisplayPeriod") != None:
            delp = soup.find("span", id="lblDisplayPeriod").text
        else: delp = ""
        if soup.find("span", id="lblDisplayReferenceNo")!= None:
            ref_num = soup.find("span", id="lblDisplayReferenceNo").text
        else: ref_num = ""
        if soup.find("span", id="lblDisplayTitle")!= None:
            title = soup.find("span", id="lblDisplayTitle").text.replace(" ", "_").replace("\n", "_")
        else: title = ""
        if soup.find("span", id="lblDisplayAOD") != None:
            aod = soup.find("span", id="lblDisplayAOD").text.replace("\n", "_")
        else: aod = ""
        if soup.find("span", id="lblDisplaySolNumber") != None:
            solNr = soup.find("span", id="lblDisplaySolNumber").text
        else: solNr = ""
        if soup.find("span", id="lblDisplayContactPerson")!= None:
            contact = soup.find("span", id="lblDisplayContactPerson").text
        else: contact = ""
        # just an assurance that the code worked and nothing screwed up
        print("\nBid" + i)
        print("Date Published: " + datePublished)
        print("Closing Date: " + cd)
        print("Category : " + cat)
        print("Procurement Entity : " + pro_id)
        print("Name: " + name)
        print("Delivery Period: " + delp)
        print("ABC: " + abc)
        print("Reference Number : " + ref_num)
        print("Title : " + title)
        print("Area of Delivery : " + aod)
        print("Solicitation Number: "+ solNr)
        print("Contact: "+ contact)
        # pastes the data inside the calc file under the titles
        ws.append([datePublished, cd, cat, pro_id, abc, name, delp, ref_num, title, aod, solNr, contact])
# saves file so work is safe and sound
filename = input("filename: ") 
wb.save(filename + ".xlsx")