# -*- coding: utf-8 -*-
"""
1.常用模块学习记录
"""
import docx
import xlrd
from io import StringIO
from io import open
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
import re
import xlwt
import random
class Study(object):
    """Study notes: small, self-contained recipes for commonly used modules."""

    # Matches any single markup tag; used to strip tags from a regex match.
    _TAG_PATTERN = re.compile(r"<.*?>")

    def __init__(self):
        """Build a pool of browser User-Agent strings and pick one for ``headers``."""
        self.user_agent = [
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
        ]
        # random.choice stays correct when entries are added or removed,
        # unlike a hard-coded index range.
        self.headers = {
            "User-Agent": random.choice(self.user_agent)
        }

    def read_from_docx(self, path="./XXX.docx"):
        """Read the text of a docx file: https://zhuanlan.zhihu.com/p/38251812

        Args:
            path: Location of the Word document to read.

        Returns:
            The items returned by ``docx.getdocumenttext`` joined with single
            spaces into one string.
        """
        # 1. Open the Word file and read its text.
        document = docx.opendocx(path)
        text_list = docx.getdocumenttext(document)  # type: list
        # 2. Join the list items into one space-separated string.
        text = " ".join(text_list)  # type: str
        print(type(text), text)
        return text

    def read_from_excel(self, xlsx_path="./XXX.xlsx", xls_path="./XXX.xls"):
        """Read Excel files with xlrd: https://www.jianshu.com/p/f2c9dff344c6

        Prints the third row of "Sheet1" and the cell at row 3, column 2 of
        the first workbook.

        Args:
            xlsx_path: Location of the first workbook.
            xls_path: Location of the second workbook (only its sheet names
                are read).
        """
        # Open the workbooks.
        # NOTE(review): xlrd 2.0+ is reported to read only .xls files; confirm
        # the installed version can open the .xlsx path.
        workbook1 = xlrd.open_workbook(xlsx_path)
        workbook2 = xlrd.open_workbook(xls_path)
        # Names of all sheets (kept as an API demonstration).
        sheet_names1 = workbook1.sheet_names()  # type: list
        sheet_names2 = workbook2.sheet_names()  # type: list
        # All sheet objects (kept as an API demonstration).
        sheet_objects = workbook1.sheets()
        # Look up the first sheet by index ...
        sheet1_object = workbook1.sheet_by_index(0)
        # ... and the same sheet by name.
        sheet1_object = workbook1.sheet_by_name(sheet_name="Sheet1")
        # Values of row 3 (rows are zero-based).
        row3_values = sheet1_object.row_values(rowx=2)  # type: list
        print(row3_values)
        # Row 3, column 2: positional form of
        # row_values(rowx=2, start_colx=1, end_colx=2).
        row3_column2_value = sheet1_object.row_values(2, 1, 2)  # type: list
        print(row3_column2_value)

    def read_from_pdf(self, pdf_filename='F:/XX/文献资料/XX.pdf'):
        """Read a PDF file: https://blog.csdn.net/Artificial_idiots/article/details/108733280

        Prints the extracted text followed by the list of its lines.

        Args:
            pdf_filename: Location of the PDF to read.

        Returns:
            The extracted text as a single string.
        """
        # The in-memory text buffer is a context manager so it is released
        # even when parsing fails.
        with open(pdf_filename, "rb") as pdf, StringIO() as rw_str:
            # PDF resource manager.
            resource = PDFResourceManager()
            # Layout-analysis parameters.
            laparam = LAParams()
            # Device that writes the converted text into the buffer.
            device = TextConverter(resource, rw_str, laparams=laparam)
            try:
                # Parse the PDF.
                process_pdf(resource, device, pdf)
            finally:
                # Always close the device, even if parsing raised.
                device.close()
            # Fetch the text before the buffer is closed.
            text = rw_str.getvalue()  # type: str
        # Split into lines.
        text_list = text.split("\n")  # type: list
        print(text, text_list)
        return text

    def save_to_docx(self):
        """Write data to a docx file: https://blog.csdn.net/cloveses/article/details/81668797

        Not implemented yet.
        """
        pass

    def save_to_excel(self, path="./测试文件.xls"):
        """Save data to an Excel file: https://openpyxl.readthedocs.io/en/stable/

        Writes a header row with two titled columns using xlwt.

        Args:
            path: Destination file; xlwt can only save .xls, not .xlsx.
        """
        # Create the workbook and a sheet.
        workbook = xlwt.Workbook()
        sheet1 = workbook.add_sheet(u'sheet1', cell_overwrite_ok=True)
        # Column widths: xlwt measures width in units of 1/256 of the width
        # of the character "0" in the default font. The default width is
        # 2960, i.e. about 11 "0" characters, so use width = 256 * N for a
        # column that is N characters wide.
        sheet1.col(0).width = 256 * 20
        sheet1.col(1).width = 256 * 50
        # write(i, j, value) targets row i, column j, both zero-based.
        # Header titles first.
        sheet1.write(0, 0, "标题")  # row 1, column 1
        sheet1.write(0, 1, "名称")  # row 1, column 2
        workbook.save(path)

    @staticmethod
    def _first_match_text(pattern, data):
        """Return the first match of ``pattern`` in ``data`` with tags stripped.

        Returns an empty string when there is no match, instead of failing on
        ``None.group()``.
        """
        found = re.search(pattern, data)
        if found is None:
            return ""
        return Study._TAG_PATTERN.sub("", found.group())

    def xml_to_str_by_re(self, xml_path="./result-rcn-184713_en.xml"):
        """Extract selected fields from an XML file with regular expressions.

        Prints every ``displayCode`` value with its tags removed, then the
        list of raw ``displayCode`` matches.

        Args:
            xml_path: Location of the UTF-8 encoded XML file.

        Returns:
            A dict with the keys ``rcn``, ``id``, ``summary``,
            ``workPerformed`` and ``finalResults`` (tag-stripped text, or ""
            when the tag is not found) and ``displayCode`` (list of
            tag-stripped values).
        """
        with open(xml_path, "r", encoding="utf-8") as file:
            data = file.read()  # type: str

        fields = {
            "rcn": self._first_match_text(r"<rcn>[0-9]{6}</rcn>", data),
            "id": self._first_match_text(r"<id>.*?</id>", data),
            "summary": self._first_match_text(
                r"<summary>[^<]*?</summary>", data),
            "workPerformed": self._first_match_text(
                r"<workPerformed>[^<]*?</workPerformed>", data),
            "finalResults": self._first_match_text(
                r"<finalResults>[^<]*?</finalResults>", data),
        }

        raw_codes = re.findall(
            r"<displayCode.*?>[^<]*?</displayCode>", data)  # type: list
        codes = []
        for item in raw_codes:
            code = self._TAG_PATTERN.sub("", item)
            print(code)
            codes.append(code)
        print(raw_codes)

        fields["displayCode"] = codes
        return fields
def main():
    """Run one of the Study recipes; the others are kept as toggles below."""
    study = Study()
    # Alternative recipes (uncomment the one to run):
    # text = study.read_from_docx()  # read text from a docx file
    # study.read_from_excel()        # read data from Excel files
    # study.read_from_pdf()          # read text from a PDF file
    # study.save_to_excel()          # save data to an Excel file
    study.xml_to_str_by_re()


# Only run the demo when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# NOTE: the lines below appear to be web-page residue from the original post, not code.
# 03-14
# 3840
# ![](https://csdnimg.cn/release/blogv2/dist/pc/img/readCountWhite.png)