Python：提取word中关键内容并导出到其它word和excel表格中_python 会议纪要导出word_cyber_1987的博客

一、前言

新部门接到一个新需求，要求根据以前的会议纪要，提取相关信息，包括会议名称、时间、地点、主持人、出席人员、列席人员、缺席人员、会议内容、汇报人、列席人等，然后要生成两样东西：

1、会议通知

右下角是会议通知时间，根据会议时间往前倒推两天自动生成。
2、会议总表

二、主要难点

1、原来的文件都是doc格式的，python的docx库不能读取，所以必须要靠win32转换成docx；
2、对docx的库使用不多，所以提取和写入表格的代码都是百度了好久获得的；
3、写入excel不难，只是参会人要把“出席人员”、“列席人员”和“缺席人员”组合起来，会议时间也只要日期不要时间；
4、最后还需要把word转成pdf，又使用了win32的库。
三、具体代码

# coding=gbk
import docx
from win32com.client import Dispatch
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
import os
import datetime
import traceback
def excel_pre():
    '''Start an Excel COM instance and publish it as the global ``xl``.

    The application window is kept hidden and alert prompts are switched off.
    '''
    global xl
    xl = Dispatch("Excel.Application")
    # Visible: True shows the Excel window, False hides it.
    # DisplayAlerts: 0 disables Excel's alert prompts.
    for prop, setting in (("Visible", False), ("DisplayAlerts", 0)):
        setattr(xl, prop, setting)
def doc2Docx(fileName):
    '''Convert a legacy .doc file to .docx and delete the original.

    The document is opened through Word's COM interface and saved next to
    the original under the same name with an "x" appended.

    Args:
        fileName: Path of the .doc file to convert.
    '''
    word = Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(fileName)
        try:
            # Second argument 12 is the Word file format for .docx.
            doc.SaveAs(fileName + "x", 12, False, "", True, "", False, False, False, False)
        finally:
            doc.Close()
        # Only reached when saving and closing succeeded, so a failed
        # conversion never costs the source file.
        os.remove(fileName)
    finally:
        # Always shut Word down; otherwise an error leaves a hidden Word
        # process running in the background.
        word.Quit()
def dict_judge(text):
    '''Return the agenda number whose Chinese marker occurs in ``text``.

    The markers '一、' to '九、' are tried in ascending order and the first
    one found anywhere in ``text`` decides the result (1-9). 0 is returned
    when none of them is present.
    '''
    for number, numeral in enumerate('一二三四五六七八九', start=1):
        if numeral + '、' in text:
            return number
    return 0
def notice_time(meeting_time, timedelta=-2):
    '''Derive the notice date from the meeting date.

    Args:
        meeting_time: Meeting date as a 'YYYY-MM-DD' string.
        timedelta: Offset in days added to the meeting date; the default
            of -2 places the notice two days before the meeting.

    Returns:
        The shifted date as a 'YYYY-MM-DD' string.
    '''
    date_format = "%Y-%m-%d"
    meeting_day = datetime.datetime.strptime(meeting_time, date_format)
    offset = datetime.timedelta(days=timedelta)
    return (meeting_day + offset).strftime(date_format)
def report_time(meeting_time, num):
    '''Simulate the report time of agenda item ``num``.

    Any '午' in ``meeting_time`` is replaced by '0' (so '午9:00' becomes
    '09:00') and the result must parse as 'HH:MM'. Item 1 gets that start
    time back unchanged; every other item is scheduled 10 minutes after
    the previous one.

    Args:
        meeting_time: Meeting start time text.
        num: Agenda item number.

    Returns:
        The report time as a string.
    '''
    meeting_time = meeting_time.replace('午', '0')
    start = datetime.datetime.strptime(meeting_time, '%H:%M')
    if num == 1:
        return meeting_time
    delay = datetime.timedelta(minutes=10 * (num - 1))
    return (start + delay).strftime('%H:%M')
def doc2pdf(input_file):
    '''Export a Word document as a PDF next to the source file.

    Args:
        input_file: Path of the Word document. The PDF gets the same path
            with the extension replaced by ".pdf".
    '''
    # Swap only the real extension. A plain str.replace would also rewrite
    # ".docx" inside folder names and would leave other extensions as they
    # are, making the PDF overwrite the source file.
    pdf_file = os.path.splitext(input_file)[0] + ".pdf"
    word = Dispatch('Word.Application')
    try:
        doc = word.Documents.Open(input_file)
        try:
            # FileFormat 17 is Word's PDF export format.
            doc.SaveAs(pdf_file, FileFormat=17)
        finally:
            doc.Close()
    finally:
        # Always shut Word down; otherwise an error leaves a hidden Word
        # process running in the background.
        word.Quit()
def set_font(paragraph):
    '''Reset a paragraph's indents and size its first run at 14 pt.

    Args:
        paragraph: A python-docx paragraph that has at least one run.
    '''
    layout = paragraph.paragraph_format
    layout.left_indent = Pt(0)   # no left indent
    layout.right_indent = Pt(0)  # no right indent
    first_run = paragraph.runs[0]
    # 14 pt corresponds to the Chinese font size "四号".
    first_run.font.size = Pt(14)
def get_meeting_info(docname):
    '''Extract the meeting details from one minutes document.

    The file ``meeting_file_path + docname`` is read paragraph by paragraph.
    Header paragraphs (time, place, host, attendees) are recognised by their
    label and the value is taken from after the full-width colon. Agenda
    items are recognised by a '一、'..'九、' marker, and the reporter is read
    from the paragraph that follows the item.

    Args:
        docname: File name of the minutes inside ``meeting_file_path``.

    Returns:
        A ``(meeting_dict, notice_dict)`` tuple. ``meeting_dict`` holds the
        meeting name, date, host and combined attendee list. ``notice_dict``
        holds the notice fields plus one entry per agenda item, keyed by the
        item number, of the form
        ``[title, reporter, item attendees, report time]``.

    Raises:
        ValueError: If an agenda item is found before any meeting-time
            paragraph, so no report time can be derived.
        IndexError: If a labelled header paragraph has no full-width colon.
    '''
    doc = docx.Document(meeting_file_path + docname)
    meeting_name = '上海NOC ' + docname.replace('纪要.docx', '')
    meeting_dict = {'会议名称':meeting_name, '会议时间': '', '主持人': '', '参会人': ''}
    notice_dict = {'会议名称':meeting_name, '会议时间':'', '会议地点':'', '会议主持':'', '出席人员': ''}
    # Collected attendee groups; joined once at the end so that no dangling
    # separator is left when a group is missing from the document.
    participants = []
    meeting_time = None
    # Fetch the paragraph list once instead of on every agenda item.
    paragraphs = doc.paragraphs
    for i, paragraph in enumerate(paragraphs):
        this_text = paragraph.text
        num = dict_judge(this_text)
        if '会议时间' in this_text:
            this_text = this_text.split('：')[1]
            notice_dict['会议时间'] = this_text
            meeting_date = this_text[:10]  # leading 'YYYY-MM-DD' part
            meeting_time = this_text[-5:]  # trailing time-of-day part
            notice_date = notice_time(meeting_date)
            meeting_dict['会议时间'] = meeting_date
            notice_dict['通知时间'] = notice_date
        elif '会议地点' in this_text:
            notice_dict['会议地点'] = this_text.split('：')[1]
        elif '会议主持' in this_text:
            meeting_dict['主持人'] = notice_dict['会议主持'] = this_text.split('：')[1]
        elif '出席人员' in this_text or '列席人员' in this_text:
            # Present and non-voting attendees are both listed.
            participants.append(this_text.split('：')[1])
        elif '缺席人员' in this_text:
            absent = this_text.split('：')[1]
            # '无' means nobody was absent, so there is nothing to add.
            if absent != '无':
                participants.append(absent)
        elif num:
            if meeting_time is None:
                raise ValueError(
                    'agenda item %d in %s appears before the meeting time paragraph'
                    % (num, docname))
            this_title = this_text.split('、', 1)[1].replace('。', '')
            try:
                # The reporter line is expected right after the agenda item.
                this_reporter = paragraphs[i+1].text.split('：', 1)[1]
            except IndexError:
                # No following paragraph, or it has no colon: no reporter.
                this_reporter = ''
            if '回避' in this_reporter:
                # Drop the parenthesised recusal remark.
                this_reporter = this_reporter.split('（', 1)[0]
            if '列席人' in this_reporter:
                # Form: reporter（列席人：names）
                this_reporter, liexiren = this_reporter.split('（', 1)
                liexiren = liexiren.split('：')[1].replace('）', '')
            else:
                liexiren = ''
            this_report_time = report_time(meeting_time, num)
            notice_dict[num] = [this_title, this_reporter, liexiren, this_report_time]
    joiner = '、'.join(participants)
    meeting_dict['参会人'] = joiner
    # The notice lists the same people without the '（开会）' remark.
    notice_dict['出席人员'] = joiner.replace('（开会）', '')
    return meeting_dict, notice_dict
def make_report(report_dict):
    '''Fill in the notice template for one meeting and export it as .docx and .pdf.

    The template '会议通知模板.docx' is read from the current directory and
    the result is written into ``report_file_path``.

    Args:
        report_dict: Notice data. The string keys '会议名称', '会议时间',
            '会议地点', '会议主持', '出席人员' and '通知时间' are read, plus
            one integer key per agenda item whose value is a sequence of
            title, reporter, item attendees and report time.
    '''
    doc = docx.Document('会议通知模板.docx')
    table = doc.tables[0]
    # Header fields: key -> (row, column) of the template cell receiving it.
    doc_cell_dict = {'会议名称': (0, 1), '会议时间': (1, 1), '会议地点': (2, 1), '会议主持': (3, 1), '出席人员': (4, 1)}
    for key, (row_idx, col_idx) in doc_cell_dict.items():
        cell = table.rows[row_idx].cells[col_idx]
        cell.text = report_dict[key]
        set_font(cell.paragraphs[0])
    # Agenda items are stored under integer keys; item i goes to table row
    # i + 5. Selecting the keys by type avoids deriving the item count from
    # how many other keys the dict happens to hold.
    for i in sorted(key for key in report_dict if isinstance(key, int)):
        row_cells = table.rows[i + 5].cells
        # Columns 1-4: title, reporter, item attendees, report time.
        for j, value in enumerate(report_dict[i][:4], start=1):
            row_cells[j].text = value
            set_font(row_cells[j].paragraphs[0])
    # Reuse the character style if the template already has it, else create it.
    try:
        style1 = doc.styles['style head1']
    except KeyError:
        style1 = doc.styles.add_style('style head1', 2)  # 2 = character style
    style1.font.bold = True
    style1.font.name = u'宋体 (中文正文)'
    style1.font.size = Pt(14)
    # Append the notice date, right-aligned, at the end of the document.
    e = doc.add_paragraph()
    e.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT
    e.add_run(report_dict['通知时间'], style=style1)
    # Save the notice under the meeting name, then export the same file as PDF.
    report_file = report_file_path + '会议通知：'+ report_dict['会议名称'] + '.docx'
    doc.save(report_file)
    doc2pdf(report_file)
def make_newxls(meeting_dict, newrow):
    '''Write one meeting record into row ``newrow`` of the global sheet ``ws``.

    Args:
        meeting_dict: Meeting data with the keys '会议名称', '会议时间',
            '主持人' and '参会人'.
        newrow: Row number of the worksheet to fill.
    '''
    # Columns that always receive the same text.
    constant_cells = {3: '党委（党组）会', 4: '现场会议'}
    # Columns filled from the meeting data: column -> key in meeting_dict.
    dict_cells = {1: '会议名称', 5: '会议时间', 6: '主持人', 7: '参会人'}
    for column in (1, 3, 4, 5, 6, 7):
        if column in constant_cells:
            content = constant_cells[column]
        else:
            content = meeting_dict[dict_cells[column]]
        ws.Cells(newrow, column).Value = content
if __name__ == "__main__":
    # Working folders. Each path ends with a separator because the helper
    # functions build file paths by plain string concatenation.
    meeting_file_path = os.path.abspath('.') + '\\' + '会议纪要库' + '\\'
    report_file_path = os.path.abspath('.') + '\\' + '会议通知库' + '\\'
    this_path = os.path.abspath('.') + '\\'
    # If the source minutes are still .doc files, convert them first by
    # calling doc2Docx(meeting_file_path + f) for every f ending in '.doc'.
    excel_pre()
    try:
        wb = xl.Workbooks.Open(this_path + '决策会议采集模板.xls')
        ws = wb.Sheets('决策会议')
        n = 3  # first worksheet row that receives a meeting record
        try:
            for f in os.listdir(meeting_file_path):
                if f.endswith('.docx'):
                    print(f)
                    meeting_dict, notice_dict = get_meeting_info(f)
                    make_report(notice_dict)  # writes the notice into report_file_path
                    make_newxls(meeting_dict, n)  # adds the record to the workbook
                    n += 1
        except Exception:
            # Report the failure but still save what was collected so far.
            traceback.print_exc()
        finally:
            wb.Save()
            wb.Close()
    finally:
        # Quit Excel as well; closing the workbook alone leaves the hidden
        # Excel process running.
        xl.Quit()
                    一、前言新部门接到一个新需求，要求根据以前的会议纪要，提取相关信息（如下图所示）包括了会议名称、时间、地点、主持人、出席人员、列席人员、缺席人员、会议内容、汇报人、列席人等等，然后要生成两样东西：1、会议通知右下角是会议通知时间，根据会议时间往前倒推两天自动生成。2、会议总表二、主要难点1、原来的文件都是doc格式的，python的docx库不能读取，所以必须要靠win32转换...
				你可以使用 Python 的正则表达式来实现这个功能。首先，你需要导入 Python 的 re 模块，然后使用 re.findall 函数来查找所有包含多个关键词的句子。
例如，如果你想在文档中查找所有包含 "keyword1" 和 "keyword2" 的句子，你可以使用以下代码：
import re
# 读取文档内容
with open('document.txt', 'r') as f:
				文章目录一、需求二、分析及思路三、整体代码
根据关键词，从Word文档里的表格中提取所需要的数据汇总到Excel中，并汇总到Excel中做台帐。
二、分析及思路
常规表格中，我们一般会通过横向填写或者竖向填写的方式来进行内容的填写，有些单元格还会有合并的情况，如下图所示，通过 python 的 docx 模块，可以完成 word 文档的信息提取：
2.1 遍历文件夹中，需要提取的 docx 文件（这里最好是把将要提取的文件放在一个文件夹中）
2.2 通过 docx 模块里面的 table 方法，
				可以使用Python的第三方库python-docx和openpyxl来实现。首先使用python-docx读取word文档，然后找到表格内容并提取，最后使用openpyxl将表格内容保存到excel中。具体实现可以搜索相关教程或者使用以下代码作为参考：
```python
import docx
from openpyxl import Workbook

# Source Word document and the workbook that receives its tables.
doc = docx.Document('example.docx')
wb = Workbook()
ws = wb.active

# Copy every row of every table: one worksheet row per table row,
# one cell per table cell.
for table in doc.tables:
    for row in table.rows:
        ws.append([cell.text for cell in row.cells])

# Write the workbook to disk.
wb.save('example.xlsx')
```
注意：以上代码只能提取简单的表格，如果表格中包含合并单元格或者其他复杂的格式，可能需要进行额外的处理。
                    CSDN-Ada助手: 
                    嗨～好久未见你更新博文了，我们现在上线了AI创作助手哦～可为你的创作提供智能化帮助，快来试试吧～https://editor.csdn.net/md/?not_checkout=1&utm_source=blog_comment_recall，在编辑器页面右侧哦～～限免！！
同时我们为您准备了一份回归奖励，快来看看吧https://activity.csdn.net/creatActivity?id=10430&utm_source=blog_comment_recall
                Python：初步完成IE浏览器的查询、点击和文件下载
                    车岗周柏豪: 
                    看到了OA和用不了谷歌浏览器，和某些字眼，同是运营商，公司让我开发一个自动取数脚本，各种不兼容把我搞死
                使用Python计算基尼系数
                    xiaoyuanyue: 
                    请问data_list中的数据分别代表什么意思
                机器学习：XGBoost+LightGBM+catboost+5折+stacking的用法
                    网名太难取664: 
                    但是你的test_data没有定义啊
                通过VBA和Python把excel的结果数据转化成word点评
                    如果表格里的数据变动了，文字里的相应数据会随之变动吗