import re

import openpyxl
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with the requests made by this script.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0'}
link = "https://www.instrument.com.cn/newproduct/"
# The listing page is fetched at module level, exactly as in the original script.
r = requests.get(link, headers=headers, timeout=20)

# Matches "<scheme>://<non-whitespace>.jpg". The scheme class is [a-zA-Z];
# the previous [a-zA-z] range also accepted the punctuation characters that
# sit between 'Z' and 'a' in ASCII ("[", "\", "]", "^", "_", "`").
_JPG_URL_RE = re.compile(r'[a-zA-Z]+://[^\s]*\.jpg')


def parse_url(html_txt):
    """Return all ``.jpg`` links found in *html_txt*.

    Args:
        html_txt: Text to scan (e.g. the HTML source of a page).

    Returns:
        A list of matched URL strings in order of appearance; empty if
        nothing matches. Matching is greedy, so within one run of
        non-whitespace characters the match extends to the last ``.jpg``.
    """
    return _JPG_URL_RE.findall(html_txt)


def download(file_path, pic_url):
    """Download *pic_url* and write the response body to *file_path*.

    Args:
        file_path: Destination path; opened in binary write mode, so an
            existing file is overwritten.
        pic_url: URL to fetch.

    Raises:
        requests.RequestException: If the request fails or times out.
        OSError: If *file_path* cannot be opened for writing.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of requests.get is ``params``, so the old positional call sent the
    # User-Agent as a query string instead of as an HTTP header.
    # The timeout mirrors the module-level request so a stalled server
    # cannot hang the script forever.
    pic_req = requests.get(pic_url, headers=headers, timeout=20)
    # NOTE(review): the HTTP status is not checked, so an error page body
    # would be written to disk as-is -- confirm whether that is intended.
    with open(file_path, 'wb') as f:
        f.write(pic_req.content)


# Write to Excel
def write_excel(data_list):
    """Write *data_list* to a new workbook, one worksheet row per item.

    Args:
        data_list: Iterable of rows; each row is passed unchanged to the
            active worksheet's ``append``.
    """
    full_excel = openpyxl.Workbook()
    full_sheet = full_excel.active
    for row in data_list:
        full_sheet.append(row)
    # NOTE(review): openpyxl writes the xlsx format; the ".xls" extension is
    # kept because other code may rely on this exact path, but spreadsheet
    # applications may warn about the mismatch -- consider ".xlsx".
    file_name = "D:\\dev\\python\\download\\instrument.xls"
    full_excel.save(file_name)
    full_excel.close()
Python 下载文件、图片和文字内容
于 2024-03-01 11:55:06 首次发布
该 Python 脚本利用 BeautifulSoup 和 requests 库从特定网站抓取产品信息，包括产品名称、品牌、类型等，并下载产品图片和样本文档（如 PDF）。首先，它通过正则表达式提取图片 URL，然后使用 openpyxl 将数据写入 Excel 文件。
摘要由CSDN通过智能技术生成