根据url链接下载word文档（doc、docx）、excel（xlsx、xls）、pdf、txt，并解析获取其中的内容（段落、表格）

有洁癖的懒羊羊

已于 2023-05-25 11:05:09 修改

阅读量1.1k

点赞数 1

分类专栏： python基础 爬虫　文章标签： word excel pdf python txt

于 2023-03-23 17:32:06 首次发布

版权声明：本文为博主原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。

本文链接：https://blog.csdn.net/liyunyang2000/article/details/129735440

版权

爬虫同时被 2 个专栏收录

18 篇文章

订阅专栏

9 篇文章

订阅专栏

目录

主函数的逻辑代码如下

根据url下载文件的函数如下

解析txt,返回txt文件内容

解析word文档，返回内容

解析excel，返回excel内容

解析pdf，返回pdf内容

主函数的逻辑代码如下

# Ordered (marker, kind) pairs. Longer markers come first so that '.docx' is
# not mistaken for '.doc' and 'xlsx' is not mistaken for 'xls'.
_KIND_MARKERS = (
    ('.docx', 'docx'),
    ('.doc', 'doc'),
    ('xlsx', 'xlsx'),
    ('xls', 'xls'),
    ('txt', 'txt'),
)


def _detect_kind(link):
    """Return the document kind implied by *link*, defaulting to ``'pdf'``."""
    lowered = link.lower()
    for marker, kind in _KIND_MARKERS:
        if marker in lowered:
            return kind
    return 'pdf'


def main(link):
    """Download the document behind *link* and return its extracted text.

    The file is stored under ``./yuxi`` using the MD5 hex digest of the link
    as its base name, then parsed according to the kind detected from the
    link (docx, doc, xlsx, xls, txt; anything else is treated as pdf).

    Links containing ``zip`` or ``rar`` are not downloaded; they are appended
    to ``yuxi.txt`` (one per line) and ``None`` is returned.

    Returns:
        The stripped text, or ``None`` when the download fails, nothing was
        extracted, or the text has no character in U+4E00-U+9FA5.
    """
    files_path = './yuxi'
    os.makedirs(files_path, exist_ok=True)

    lowered = link.lower()
    if 'zip' in lowered or 'rar' in lowered:
        # Archives are not parsed; record the link for later inspection.
        with open('yuxi.txt', 'a', encoding='utf-8') as file_txt:
            file_txt.write(f'{link}\n')
        return None

    kind = _detect_kind(link)
    # The link is the only identifier available here, so it seeds the name.
    title_md5 = hashlib.md5(link.encode('utf-8')).hexdigest()
    the_path = f'{os.path.join(files_path, title_md5)}.{kind}'

    if not download_file(link, the_path):
        return None

    if kind in ('docx', 'doc'):
        new_content = convert_doc_to_txt(files_path, the_path, kind)
    elif kind == 'xlsx':
        new_content = convert_xlsx_to_txt(the_path)
    elif kind == 'xls':
        new_content = read_xls(the_path)
    elif kind == 'txt':
        new_content = read_txt(the_path)
    else:
        new_content = convert_pdf_to_txt(the_path)

    new_content = (new_content or '').strip()
    # Discard results without any Chinese character.
    if not new_content or not re.search(r'[\u4e00-\u9fa5]', new_content):
        return None
    return new_content


if __name__ == '__main__':
    # Placeholder link; substitute the real document URL before running.
    url = 'xxxxxxxxxx'
    main(url)

根据url下载文件的函数如下

def download_file(link, the_path):
    """Stream *link* to the file *the_path*.

    Args:
        link: URL to fetch.
        the_path: Destination path; opened in binary write mode.

    Returns:
        ``True`` when the response had a success status and was written out,
        ``False`` when the request failed or returned an HTTP error status.
    """
    try:
        # The context manager releases the connection even on early exit.
        with req.get(link, stream=True, timeout=500) as response:
            # Without this an error page (404/500) would be saved as the
            # document and reported as a successful download.
            response.raise_for_status()
            with open(the_path, "wb") as file:
                for data in response.iter_content(chunk_size=1024):
                    file.write(data)
    except req.RequestException:
        print('下载文件失败')
        return False
    return True

解析txt,返回txt文件内容

def read_txt(the_path):
    """Return the decoded text of the file at *the_path*.

    The bytes are decoded as UTF-8 (a leading BOM is dropped) and, if that
    fails, as GB18030. When neither decodes cleanly, UTF-8 is used with
    undecodable bytes replaced so the call still returns text.
    """
    with open(the_path, "rb") as file:
        raw = file.read()
    for encoding in ("utf-8-sig", "gb18030"):
        try:
            return raw.decode(encoding)
        except UnicodeDecodeError:
            continue
    return raw.decode("utf-8", errors="replace")

解析word文档，返回内容

def convert_doc_to_txt(p_path, pat, the_type):
    """Extract paragraph and table text from a Word file, then delete it.

    Args:
        p_path: Directory that receives the converted ``.docx`` file.
        pat: Path of the downloaded Word file.
        the_type: ``'doc'`` to convert through LibreOffice first; any other
            value reads *pat* directly as docx.

    Returns:
        Paragraph texts followed by table cell texts, concatenated without
        separators. Text gathered before a parse error is still returned.

    Raises:
        subprocess.CalledProcessError: If the LibreOffice conversion fails.
    """
    new_path = pat
    pieces = []
    try:
        if the_type == 'doc':
            # Convert the legacy .doc file to .docx.
            subprocess.run(
                ["/bin/libreoffice", "--headless", "--convert-to", "docx", "--outdir", p_path, pat],
                shell=False,
                check=True,
                encoding="utf-8",
            )
            # The converted file lands in p_path under the source base name.
            stem = os.path.splitext(os.path.basename(pat))[0]
            new_path = os.path.join(p_path, f"{stem}.docx")
        try:
            file = docx.Document(new_path)
            # Paragraphs
            pieces.extend(paragraph.text for paragraph in file.paragraphs)
            # Tables
            for table in file.tables:
                for row in table.rows:
                    pieces.extend(cell.text for cell in row.cells)
        except Exception as exception:
            # Parsing is best-effort: report the problem and keep what was read.
            print(f"ERROR !!! {exception}")
    finally:
        # Remove the download and the converted copy, even on failure.
        for leftover in {pat, new_path}:
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass
    return "".join(pieces)

解析excel，返回excel内容

def convert_xlsx_to_txt(file_name):
    """Return the text of the first worksheet of an xlsx workbook.

    Cells are visited row by row and their values converted with ``str``,
    then concatenated without separators. Cells whose value is ``None`` are
    skipped; ``0`` and ``False`` are kept.

    Args:
        file_name: Path of the ``.xlsx`` file.

    Returns:
        The concatenated cell text, or ``''`` if there is no worksheet.
    """
    wb = load_workbook(file_name)
    try:
        # ``worksheets`` replaces the deprecated get_sheet_names() /
        # get_sheet_by_name() pair.
        if not wb.worksheets:
            return ''
        ws = wb.worksheets[0]
        pieces = []
        for row in ws.rows:
            for cell in row:
                if cell.value is not None:
                    pieces.append(str(cell.value))
        return ''.join(pieces)
    finally:
        wb.close()


def read_xls(excel_name):
    """Return the text of the first sheet of a legacy xls workbook.

    Cells are visited row by row and their values converted with ``str``,
    then concatenated without separators. Cells whose value is ``''`` or
    ``None`` are skipped; numeric zero is kept.

    Args:
        excel_name: Path of the ``.xls`` file.

    Returns:
        The concatenated cell text.
    """
    workbook = open_workbook(excel_name)
    first_name = workbook.sheet_names()[0]
    sheet = workbook.sheet_by_name(first_name)

    pieces = []
    for row in sheet.get_rows():
        for cell in row:
            # Compare explicitly: a truthiness test would also drop 0.0.
            if cell.value not in ('', None):
                pieces.append(str(cell.value))
    return ''.join(pieces)

解析pdf，返回pdf内容

def convert_pdf_to_txt(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    The last line of each page's extracted text is dropped (presumably a
    page-number footer; confirm against real documents). Page texts are
    concatenated without a separator. A page with no extractable text
    contributes nothing.

    Extraction is best-effort: if a page raises, the error is printed and
    the text gathered so far is returned.
    """
    pieces = []
    # The context manager closes the PDF; no explicit close() is needed.
    with pdfplumber.open(pdf_path) as pdf:
        try:
            for page in pdf.pages:
                # extract_text() may yield None for a text-less page.
                lines = (page.extract_text() or '').split('\n')
                pieces.append('\n'.join(lines[:-1]))
        except Exception as exception:
            print(f"ERROR !!! {exception}")
    return ''.join(pieces)

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。