上一篇展示了用Python读取Word文档正文内容的方法与过程,这里不再赘述。本文主要展示用正则表达式分别提取Word文档中的中文和英文内容的过程。
一、导入的标准库与第三方库:
from zipfile import ZipFile
from io import BytesIO
from bs4 import BeautifulSoup
import re
import pandas as pd
import pymysql
import xlrd
二、用正则表达式分别提取中英文
# Read the XML of the Word document body.  The .docx file is opened as a zip
# archive and word/document.xml is read out of it.
# Both the file handle and the archive are closed as soon as their bytes
# have been read, instead of staying open for the rest of the script.
with open('/home/lijiang/Excel人员数据/码农必会词汇表.docx', 'rb') as docx_stream:
    wordFile = BytesIO(docx_stream.read())
with ZipFile(wordFile) as document:
    xml_content = document.read('word/document.xml')
wordObj = BeautifulSoup(xml_content.decode('utf-8'), features="lxml")
# Collect the <w:t> elements; their .text is what the later steps process.
# find_all is the current method name; findAll is a deprecated alias.
textStr = wordObj.find_all('w:t')
# Build two parallel lists from the text elements: one with the ASCII letters
# of each element, one with the Chinese characters of each element.
# Letters are joined without a separator, so spaces, digits and punctuation
# inside an element are dropped.
ascii_letter = re.compile(r'[A-Za-z]')
non_chinese = re.compile(u'([^\u4e00-\u9fa5])')

# Echo the text of every element before extracting anything.
for node in textStr:
    print(node.text)

# English part: every ASCII letter of the element, concatenated.
list_words = [''.join(ascii_letter.findall(node.text)) for node in textStr]
# Chinese part: the element text with everything outside U+4E00..U+9FA5 removed.
list_chinese = [non_chinese.sub('', node.text) for node in textStr]
# Remove empty strings from both lists in place, then show the results.
# NOTE: the two lists are filtered independently, so equal lengths and
# position-by-position pairing afterwards are not guaranteed.
list_chinese[:] = [note for note in list_chinese if note != '']
list_words[:] = [term for term in list_words if term != '']
print(list_words)
print(list_chinese)
三、提取数据写入Excel并插入MySQL
# Write the vocabulary to an Excel sheet, read it back, and insert the rows
# into the MySQL table.
# A single path constant is used for both writing and reading, so the rows
# that get inserted always come from the workbook that was just written.
EXCEL_PATH = '/home/lijiang/Excel人员数据/码农必会词汇表.xlsx'
INSERT_SQL = 'insert into 码农必会词汇表(词组,注释) values (%s,%s)'

# English entries become the index, Chinese entries the single data column.
# NOTE: pandas raises ValueError here if the two lists differ in length.
result_str = pd.DataFrame(list_chinese, index=list_words)
print(result_str)
result_str.to_excel(EXCEL_PATH, sheet_name='sheet1')

# Read the first sheet back with pandas (xlrd 2.x no longer reads .xlsx).
# header=0 consumes the header row written by to_excel, so only data rows
# reach the database.  dtype=str with keep_default_na=False keeps entries
# such as "null" or "NA" as plain text instead of converting them to NaN.
File = EXCEL_PATH
sheet = pd.read_excel(
    File,
    sheet_name=0,
    header=0,
    dtype=str,
    keep_default_na=False,
)
# One (word, explanation) tuple per data row.
cap = list(sheet.itertuples(index=False, name=None))
print(cap)

# Insert the rows into MySQL.
conn = pymysql.connect(
    host='localhost',
    user='root',
    password='',
    charset='utf8',
    port=3306,
    db='Excel人员数据'
)
try:
    # The cursor is closed when the with-block ends, even on error.
    with conn.cursor() as cursor:
        for word, explain in cap:
            print(word, explain)
            # Values are passed as parameters so the driver escapes them.
            cursor.execute(INSERT_SQL, (word, explain))
    # Commit once, after every row has been inserted successfully.
    conn.commit()
finally:
    # Always release the connection, whether or not the inserts succeeded.
    conn.close()
print('insert into table complete!')
四、总结
本次练习由自己提出业务需求,并用Python实现了自动化处理的功能,巩固并夯实了Python基础,是一次很好的练习实践。