1 python操作csv文件
import csv
from itertools import islice
# 写入csv文件
def write_csv_custom(csv_path):
    """Write one sample row whose quoting is controlled by the caller.

    Automatic quoting is switched off, so a field appears quoted in the file
    only when the quote characters are part of the value itself. This lets
    numbers stay bare while strings carry explicit double quotes.

    Args:
        csv_path: Destination file path; an existing file is overwritten.
    """
    # If Chinese text shows up garbled in spreadsheet apps, encoding="utf_8_sig"
    # can be used instead.
    with open(csv_path, "w", newline="", encoding="UTF-8") as csv_file:
        # quotechar must be None (not ""): Python 3.11+ rejects an empty string
        # with TypeError. With no quote character configured, the literal '"'
        # inside the values needs no escaping under QUOTE_NONE.
        csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_NONE, quotechar=None)
        # Number left bare, strings carry their own double quotes.
        csv_writer.writerow([1, '"English"', '"python"'])
# 写入csv文件
def write_csv(csv_path):
    """Create (or overwrite) ``csv_path`` with a single sample header row.

    The default writer dialect is used, so quotes are only emitted when a
    field actually requires them.
    """
    header = ['Chinese', 'English', 'python']
    # encoding="utf_8_sig" is an alternative if Chinese text appears garbled.
    with open(csv_path, mode="w", newline="", encoding="UTF-8") as out:
        csv.writer(out).writerow(header)
# 读csv文件
def read_csv(csv_path, skip_rows=20):
    """Print the rows of a CSV file twice: all rows, then rows after a skip.

    Args:
        csv_path: Path of the UTF-8 encoded CSV file to read.
        skip_rows: Number of leading rows to skip in the second pass.
            Defaults to 20, the value that was previously hard-coded.
    """
    # newline="" lets the csv module do its own newline handling, so line
    # breaks embedded in quoted fields are preserved unchanged.
    with open(csv_path, "r", newline="", encoding="UTF-8") as csv_file:
        for row in csv.reader(csv_file):
            print(row)
    # Second pass: skip the first ``skip_rows`` rows without loading the file
    # into memory.
    with open(csv_path, "r", newline="", encoding="UTF-8") as csv_file:
        for row in islice(csv.reader(csv_file), skip_rows, None):
            print(row)
if __name__ == "__main__":
    # Demo round trip: write the sample CSV, then print it back.
    path = "/home/mason.csv"
    for step in (write_csv, read_csv):
        step(path)
2 python操作json文件
import json
# 写json文件
def write_json(json_path):
    """Dump a one-entry sample mapping to ``json_path`` as indented UTF-8 JSON."""
    payload = {"data": "chinese"}
    with open(json_path, "w", encoding="UTF-8") as sink:
        # ensure_ascii=False writes non-ASCII characters as-is rather than as
        # \uXXXX escapes.
        json.dump(payload, sink, indent=4, ensure_ascii=False)
# 读取json文件
def read_json(json_path):
    """Load the JSON document at ``json_path`` and print its ``"data"`` entry."""
    with open(json_path, "r", encoding="UTF-8") as source:
        # json.load returns the decoded Python object (a dict for this demo file).
        parsed = json.load(source)
    print(parsed["data"])
if __name__ == "__main__":
    # Demo round trip: write the sample JSON file, then print its "data" value.
    path = "/home/mason.json"
    for step in (write_json, read_json):
        step(path)
3 python操作excel文件
使用pandas读取excel,也可使用xlrd操作excel文件
import pandas as pd
def read_excel(excel_path):
    """Print the first-column value of every data row in an Excel file.

    Args:
        excel_path: Path of the workbook; it is loaded with ``pd.read_excel``
            using that function's defaults.
    """
    df_data = pd.read_excel(excel_path)
    # Iterate rows as tuples without the index, so position 0 is the first column.
    for row in df_data.itertuples(index=False):
        # Positional access works for any header. The attribute name ``_1``
        # only exists when the first column's header is not a valid Python
        # identifier and pandas falls back to positional field names.
        print(row[0])
4 python读pdf
方法1:使用pymupdf读取pdf中的文本(推荐此方法)。
注意:下面的方法是旧版本,新版本有新的方法,可参考官网。此外:pymupdf结合RAG技术推出了新的包 pymupdf4llm。
import fitz
# 此种方法可以解析含有“许可口令”的pdf文件
def parse_pdf(pdf_path):
    """Extract the text of every page of a PDF with PyMuPDF and clean it.

    Each page's text has invalid characters dropped, tabs and line breaks
    removed, and remaining whitespace runs collapsed to single spaces. Page
    texts are concatenated with no separator.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The cleaned text of all pages as a single string.
    """
    doc_obj = fitz.open(pdf_path)
    try:
        page_texts = []
        for page in doc_obj.pages():
            temp_text = page.get_text()
            # Drop characters that cannot be encoded as UTF-8 (e.g. lone
            # surrogates). errors= must be a handler name such as "ignore";
            # passing "utf-8" there raises LookupError once a bad character
            # is actually encountered.
            temp_text = temp_text.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore")
            # Remove '\r\n' before '\n'; in the opposite order the '\r\n'
            # replacement can never match.
            temp_text = temp_text.replace('\r\n', '').replace('\n', '').replace('\t', '')
            # Collapse any remaining whitespace runs to single spaces.
            temp_text = ' '.join(temp_text.split())
            page_texts.append(temp_text)
        return "".join(page_texts)
    finally:
        # Release the document even if a page fails to parse.
        doc_obj.close()
方法2:使用PyPDF2读取pdf文件
pip install PyPDF2
import PyPDF2
from PyPDF2.errors import DependencyError
def parse_pdf(pdf_path):
    """Extract and clean the text of every page of a PDF with PyPDF2.

    Uses the current PyPDF2 reader API (``PdfReader``, ``is_encrypted``,
    ``pages``, ``extract_text``) in place of the legacy camelCase names.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The cleaned text of all pages as a single string, or the string
        ``"-1"`` when the file is encrypted or the reader raises
        ``DependencyError``.
    """
    # The context manager closes the file on every return path.
    with open(pdf_path, 'rb') as pdf_file_obj:
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file_obj)
        except DependencyError as e:
            print(e)
            return "-1"
        # Encrypted files are rejected rather than decrypted. (Original note:
        # decrypt() returns 1 for a correct password and 0 for a wrong one.)
        if pdf_reader.is_encrypted:
            return "-1"
        page_texts = []
        for cur_page in pdf_reader.pages:
            temp_text = cur_page.extract_text() or ""
            # Drop characters that cannot be encoded as UTF-8. errors= must be
            # a handler name such as "ignore", not an encoding name.
            temp_text = temp_text.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore")
            # Remove '\r\n' before '\n'; in the opposite order the '\r\n'
            # replacement can never match.
            temp_text = temp_text.replace('\r\n', '').replace('\n', '').replace('\t', '')
            # Collapse any remaining whitespace runs to single spaces.
            temp_text = ' '.join(temp_text.split())
            page_texts.append(temp_text)
        return "".join(page_texts)
方法3:使用pdfminer3k读取pdf文件(不推荐)
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
def parse_pdf(pdf_path):
    """Extract the text of horizontal text boxes from a PDF with pdfminer3k.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The concatenated, cleaned text of all horizontal text boxes, or
        ``None`` (after printing a message) when the document reports that it
        is not extractable.
    """
    # Keep the file open for the whole parse and close it on every exit path.
    with open(pdf_path, 'rb') as pdf_io:
        pdf_parser = PDFParser(fp=pdf_io)
        pdf_doc = PDFDocument()
        # Parser and document must be linked to each other before initializing.
        pdf_parser.set_document(doc=pdf_doc)
        pdf_doc.set_parser(parser=pdf_parser)
        pdf_doc.initialize()
        if not pdf_doc.is_extractable:
            print('The file is can not extract!')
            return None
        # Resource manager, layout parameters and aggregating device.
        pdf_manager = PDFResourceManager()
        pdf_laparams = LAParams()
        pdf_device = PDFPageAggregator(rsrcmgr=pdf_manager, laparams=pdf_laparams)
        # Interpreter that renders each PDF page into the device.
        pdf_interpreter = PDFPageInterpreter(rsrcmgr=pdf_manager, device=pdf_device)
        chunks = []
        for page in pdf_doc.get_pages():
            pdf_interpreter.process_page(page)
            layout = pdf_device.get_result()
            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                try:
                    temp_text = x.get_text()
                    # Drop characters that cannot be encoded as UTF-8. errors=
                    # must be a handler name such as "ignore".
                    temp_text = temp_text.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore")
                    # Remove '\r\n' before '\n'; in the opposite order the
                    # '\r\n' replacement can never match.
                    chunks.append(temp_text.replace('\r\n', '').replace('\n', '').replace('\t', ''))
                except Exception:
                    # Best effort: report the failing file and carry on.
                    # Unlike a bare except, KeyboardInterrupt and SystemExit
                    # are not swallowed.
                    print(pdf_path + " Failed")
        return "".join(chunks)
5 python操作docx
使用python-docx读取docx文件
注意:此方法不适用于doc文件,读取doc文件需要使用pypiwin32包
def parse_docx(docx_path):
    """Extract and clean the paragraph text of a .docx file with python-docx.

    Each paragraph has invalid characters dropped, tabs and line breaks
    removed, and whitespace runs collapsed. Paragraphs that end up empty are
    skipped; the rest are concatenated with no separator.

    Args:
        docx_path: Path of the .docx file.

    Returns:
        The cleaned text of all non-empty paragraphs as a single string.
    """
    # Imported locally because no module-level ``import docx`` is visible for
    # this snippet; harmless if the module is already imported elsewhere.
    import docx

    file_docx = docx.Document(docx_path)
    parts = []
    for paragraph in file_docx.paragraphs:
        temp_text = str(paragraph.text)
        # Drop characters that cannot be encoded as UTF-8. errors= must be a
        # handler name such as "ignore", not an encoding name.
        temp_text = temp_text.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore")
        # Remove '\r\n' before '\n'; in the opposite order the '\r\n'
        # replacement can never match.
        temp_text = temp_text.replace('\r\n', '').replace('\n', '').replace('\t', '')
        temp_text = ' '.join(temp_text.split())
        # Skip paragraphs that are empty after cleaning.
        if temp_text != "":
            parts.append(temp_text)
    return "".join(parts)
6 处理特殊字符
import re
def remove_character(text):
    """Strip URLs, dot runs and disallowed symbols from ``text``.

    Three passes are applied in order:
      1. remove ``http://`` / ``https://`` URLs (ASCII URL characters only, so
         adjacent Chinese text is not swallowed);
      2. remove runs of two or more consecutive dots;
      3. remove every character that is not Chinese (U+4E00-U+9FA5), an ASCII
         letter or digit, a space, or one of the listed punctuation marks.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    # ``https?`` matches the scheme itself. The former ``[http|https]*`` was a
    # character class, so it also ate stray h/t/p/s/| characters in front of
    # any "://" (e.g. it turned "ftp://x" into "f").
    text = re.sub(r'https?://[a-zA-Z0-9.?/&=:%#_~+-]*', '', text)
    # Raw string avoids the invalid-escape warning for "\.".
    text = re.sub(r"\.{2,}", "", text)
    # Only the leading ^ negates the class; further ^ characters inside it
    # were literal carets that accidentally whitelisted '^'.
    text = re.sub(r"[^a-zA-Z0-9\u4e00-\u9fa5 .。,,??::\]\[<《>》]", "", text)
    return text
if __name__ == "__main__":
    # Demo: leading dot run, mixed Latin/Chinese text and decorative symbols.
    sample = "....................zhong过★、…【】↓▼▲◆●《》"
    cleaned = remove_character(sample)
    print(cleaned)
7 doc转换为docx
需要安装pypiwin32包
from win32com import client
# 安装pypiwin32包
# pip install pypiwin32
def doc_to_docx(doc_path, docx_path):
    """Convert a .doc file to .docx by automating Word over COM.

    Requires Windows with Word installed and the pypiwin32 package
    (``pip install pypiwin32``).

    Args:
        doc_path: Path of the source .doc file.
        docx_path: Path the .docx file is saved to.

    Returns:
        ``None`` on success, ``-1`` if any step raised (the exception is
        printed).
    """
    try:
        # Get the Word application object.
        word = client.Dispatch("Word.Application")
        try:
            doc = word.Documents.Open(doc_path)
            try:
                # Per the original note, format code 12 or 16 selects .docx.
                doc.SaveAs(docx_path, 16)
            finally:
                # Close the document even when saving fails.
                doc.Close()
        finally:
            # Always quit Word so a failed conversion does not leave the
            # application running in the background.
            word.Quit()
    except Exception as e:
        print(e)
        return -1
8 重命名文件名称
import os
import shutil
# dir_old为原数据目录,dir_new为重命名后数据目录
def rename_file(dir_old, dir_new):
    """Copy the files in ``dir_old`` into ``dir_new`` under sequential names.

    Each regular file is copied (the originals are left untouched) to
    ``file_<i><suffix>``, where ``i`` counts from 0 in sorted filename order
    and ``<suffix>`` is the original extension. Subdirectories are skipped.

    Args:
        dir_old: Directory holding the original files.
        dir_new: Directory receiving the renamed copies; created if missing.
    """
    os.makedirs(dir_new, exist_ok=True)
    # Sort so numbering is reproducible (os.listdir order is arbitrary), and
    # keep only regular files because shutil.copy cannot copy a directory.
    file_names = sorted(
        name for name in os.listdir(dir_old)
        if os.path.isfile(os.path.join(dir_old, name))
    )
    for i, file_name in enumerate(file_names):
        # Keep the original extension on the new name.
        file_suffix = os.path.splitext(file_name)[-1]
        shutil.copy(
            os.path.join(dir_old, file_name),
            os.path.join(dir_new, "file_" + str(i) + file_suffix),
        )
# Example run: source directory and target directory for the renamed copies.
dir_old, dir_new = r'E:\my_data', r'E:\my_data_new'
rename_file(dir_old=dir_old, dir_new=dir_new)
9 pdf转化为word
需要安装pdf2docx
from pdf2docx import Converter
def pdf_to_word(pdf_path, docx_path):
    """Convert a PDF file to a .docx file with pdf2docx.

    Args:
        pdf_path: Path of the source PDF.
        docx_path: Path the .docx file is written to.
    """
    cv = Converter(pdf_path)
    try:
        # start=0, end=None: convert from the first page through the last.
        cv.convert(docx_path, start=0, end=None)
    finally:
        # Close the converter even if the conversion raises.
        cv.close()
# Example run with explicit argument names.
pdf_to_word(pdf_path="E:/test.pdf", docx_path="E:/111.docx")
10 pdf转化为img
"""
# 方法1使用python-office
# 安装python-office包
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple python-office -U
import office
# 一行代码,实现转换
# 参数说明:
# pdf_path = 你的PDF文件的地址
# out_dir = 转换后的图片存放地址,可以不填,默认是PDF的地址
office.pdf.pdf2imgs(
pdf_path='E:/xi_data_special/file_pwd.pdf',
out_dir='E:/xi_img'
)
"""
# 方法2使用pymupdf中的fitz
# 安装pymupdf包
# pip install pymupdf
import os
import fitz
def pdf_to_img(pdf_path, img_dir):
    """Render every page of a PDF to a PNG image with PyMuPDF.

    Pages are saved as ``<img_dir>/<page index>.png``, with the index
    starting at 0.

    Args:
        pdf_path: Path of the source PDF.
        img_dir: Directory receiving the PNG files; created if missing.
    """
    os.makedirs(img_dir, exist_ok=True)
    doc_obj = fitz.open(pdf_path)
    try:
        for i in range(doc_obj.page_count):
            pixmap = doc_obj.load_page(i).get_pixmap()
            pixmap.save(os.path.join(img_dir, str(i) + ".png"))
    finally:
        # Release the document even if rendering or saving fails.
        doc_obj.close()
# Example run: source PDF and the directory that receives the page images.
pdf_path, img_dir = "E:/xi_data_special/file_pwd.pdf", "E:/xi_img"
pdf_to_img(pdf_path=pdf_path, img_dir=img_dir)