python实现用正则提取Word正文和写入Excel，并插入MySQL

m0_64880493_江哥

已于 2023-09-21 05:58:47 修改

阅读量346

点赞数 1

文章标签： python word excel

于 2023-09-19 22:00:12 首次发布

本文链接：https://blog.csdn.net/m0_64880493/article/details/133047772

版权

上一篇展示了Python读取Word文档正文内容的方法与过程，此处不再赘述。本文主要展示用正则表达式分别提取Word文档中的中文和英文内容的过程。

一、导入的库（标准库与第三方库）：

from zipfile import ZipFile
from io import BytesIO
from bs4 import BeautifulSoup
import re
import pandas as pd
import pymysql
import xlrd

二、用正则表达式分别提取中英文

# Read the text elements of the Word document.
# The .docx file is opened as a zip archive and its 'word/document.xml'
# member is parsed; every <w:t> element found there is collected in textStr.
# Both the file handle and the archive are closed as soon as their content
# has been read, instead of being left open for the rest of the script.
with open('/home/lijiang/Excel人员数据/码农必会词汇表.docx', 'rb') as docx_handle:
    wordFile = BytesIO(docx_handle.read())
with ZipFile(wordFile) as document:
    xml_content = document.read('word/document.xml')
wordObj = BeautifulSoup(xml_content.decode('utf-8'), features="lxml")
textStr = wordObj.findAll('w:t')

# Collect the English letters of every text element.
# Each element's text is echoed to stdout, then stripped of everything that
# is not an ASCII letter (spaces and punctuation included); the result is
# appended to list_words, even when it is an empty string.
# list_chinese is created empty here alongside list_words.
list_words, list_chinese = [], []
for element in textStr:
    raw_text = element.text
    print(raw_text)
    list_words.append(re.sub(r'[^A-Za-z]', '', raw_text))

# Collect the Chinese characters of every text element.
# Any character outside the range U+4E00..U+9FA5 is removed; one string per
# element is added to list_chinese (possibly an empty string).
list_chinese.extend(
    re.sub(u'([^\u4e00-\u9fa5])', '', element.text) for element in textStr
)

# Remove the empty-string entries from both lists.
# Slice assignment filters each list in place in a single pass, replacing
# the reverse index loop that deleted elements one at a time.
list_chinese[:] = [entry for entry in list_chinese if entry]
list_words[:] = [entry for entry in list_words if entry]
print(list_words)
print(list_chinese)

三、提取数据写入Excel并插入MySQL

# Write the vocabulary to an Excel worksheet.
# The Chinese notes form the single data column and the English words are
# used as the row index of the DataFrame.
excel_path = '/home/lijiang/Excel人员数据/码农必会词汇表.xlsx'
vocabulary_table = pd.DataFrame(data=list_chinese, index=list_words)
print(vocabulary_table)
vocabulary_table.to_excel(excel_path, sheet_name='sheet1')

# Read the Excel sheet and insert its rows into the MySQL table.
# NOTE(review): the export step above writes '码农必会词汇表.xlsx', while this
# step opens '码农必会词汇表.xls' -- confirm which file is intended.
# NOTE(review): every row of the first sheet is inserted, including a header
# row if the workbook has one -- verify against the actual file.
File = '/home/lijiang/Excel人员数据/码农必会词汇表.xls'
excel = xlrd.open_workbook(File)
sheet = excel.sheet_by_index(0)
rows = sheet.nrows
cap = [sheet.row_values(i) for i in range(rows)]
print(cap)

# The workbook is read before connecting so that a missing or unreadable
# file does not leave a database connection open.
conn = pymysql.connect(
    host='localhost',
    user='root',
    password='',
    charset='utf8',
    port=3306,
    db='Excel人员数据'
)
try:
    with conn.cursor() as cursor:
        for words in cap:
            word = words[0]
            explain = words[1]
            print(word, explain)
            # Pass exactly the two values the statement has placeholders for,
            # so rows with additional cells do not break parameter binding.
            cursor.execute(
                'insert into 码农必会词汇表(词组,注释) values (%s,%s)',
                (word, explain),
            )
    # Commit once, only after every row was inserted successfully.
    conn.commit()
finally:
    # Always release the connection, even when an insert fails.
    conn.close()
print('insert into table complete!')