读网址爬取网页

最新推荐文章于 2024-09-20 09:39:51 发布

chun_yin

最新推荐文章于 2024-09-20 09:39:51 发布

阅读量312

点赞数

文章标签：爬虫

本文链接：https://blog.csdn.net/chun_yin/article/details/113094961

版权

该博客展示了如何使用Python的openpyxl库读写Excel文件，包括创建表格、追加数据等操作。同时，它实现了一个多进程爬虫，利用BeautifulSoup解析网页并提取数据，通过队列和多进程进行数据抓取和处理，提高了爬取效率。最后，将爬取的数据更新到Excel表格中。

摘要由CSDN通过智能技术生成

xie_xlsx.py（Excel 读写工具模块，供下面的爬虫脚本导入）

# coding=gbk
import openpyxl 
import re  

class xie_xlsx():
    """Read from and append to a single sheet of an .xlsx file via openpyxl.

    Attributes:
        name01: Workbook path handed to ``openpyxl.load_workbook`` and
            ``Workbook.save``.
        biaodan: Title of the worksheet every method operates on.
    """

    def __init__(self, name, biandan='表01'):
        self.biaodan = biandan
        self.name01 = name

    def dubiao(self):
        """Return the sheet content as a list of rows, each a list of cell values."""
        book = openpyxl.load_workbook(self.name01)
        rows = book[self.biaodan].rows
        return [[cell.value for cell in row] for row in rows]

    def chuangbiao(self, tou):
        """Create a new workbook whose only sheet starts with the row ``tou``.

        The active sheet is renamed to ``self.biaodan`` and the workbook is
        saved to ``self.name01``.
        """
        book = openpyxl.Workbook()
        first_sheet = book.active
        first_sheet.title = self.biaodan
        first_sheet.append(tou)
        book.save(self.name01)
        print("表格创建成功。。。。。。。")

    def xiubiao02(self, data_list):
        """Append ``data_list`` as one row, converting every item with ``str``."""
        book = openpyxl.load_workbook(self.name01)
        target = book[self.biaodan]
        target.append([str(item) for item in data_list])
        book.save(self.name01)
        print('数据已更新。。。。。。。。。。。。')

    def xiubiao03(self, data_list):
        """Append one row per element of ``data_list``.

        Each element is itself iterated and its items are converted with
        ``str``; the workbook is saved once after all rows are appended.
        """
        book = openpyxl.load_workbook(self.name01)
        target = book[self.biaodan]
        for record in data_list:
            target.append([str(item) for item in record])
        book.save(self.name01)
        print('数据集已更新。。。。。。。。。。。。')

    def xiubiao04(self, data_list):
        """Append every item of ``data_list`` as its own single-cell row."""
        book = openpyxl.load_workbook(self.name01)
        target = book[self.biaodan]
        for item in data_list:
            target.append([str(item)])
        book.save(self.name01)
        print('数据已更新。。。。。。。。。。。。')

# Prefer the packaged copy of the helper module; fall back to a sibling
# xie_xlsx.py. Only an import failure should trigger the fallback, so any
# other error raised while importing the package is no longer hidden.
try:
    from 我的工具 import xie_xlsx
except ImportError:
    import xie_xlsx
from bs4 import BeautifulSoup
import time 
import requests  #爬取
import re
import openpyxl
import multiprocessing
# Output workbook that receives the scraped rows.
yue01 = xie_xlsx.xie_xlsx('爬虫结果终601-1200.xlsx')
# Create the (empty) output file only in the process that runs this script.
# With the "spawn" start method each pool worker re-imports this module, and
# an unguarded call here would make every worker recreate the file as well.
if __name__ == '__main__':
    yue01.chuangbiao([])

def qing(url, max_retries=None):
    """Download ``url`` and return the response body decoded as UTF-8.

    Failed requests are reported on stdout and retried after a one-second
    pause.

    Args:
        url: Address to fetch.
        max_retries: Maximum number of attempts. ``None`` (the default)
            retries forever.

    Returns:
        The response text.

    Raises:
        Exception: The error of the last attempt, once ``max_retries``
            attempts have failed. Never raised when ``max_retries`` is None.
    """
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
    }
    # Certificate verification is switched off below (verify=False); silence
    # the warning urllib3 would otherwise print for it.
    requests.packages.urllib3.disable_warnings()

    attempts = 0
    while True:
        try:
            r = requests.get(url, headers=headers, verify=False, timeout=10)
            r.encoding = 'utf-8'
            return r.text
        except Exception as e:
            # Deliberately broad: any failure is treated as retryable.
            print('请求错误：', e)
            attempts += 1
            if max_retries is not None and attempts >= max_retries:
                raise
            # The file imports the ``time`` module, not a bare ``sleep``.
            time.sleep(1)

def run(in_q, out_q):
    """Process one URL from ``in_q`` and put its table text on ``out_q``.

    The page is fetched with ``qing``, parsed with BeautifulSoup, and the
    stripped text of every ``th``/``td`` cell is collected from each ``tr``
    that has more than one cell and a non-empty first cell. All cells of all
    such rows are concatenated into one flat list.

    Exactly one list is put on ``out_q`` and ``in_q.task_done()`` is called
    for the URL taken, even when processing fails (an empty list is reported
    in that case), so a caller waiting for one result per URL cannot block
    forever.

    Args:
        in_q: Queue of URLs to crawl.
        out_q: Queue receiving one list of cell strings per URL.

    Returns:
        ``out_q`` when a URL was processed, ``None`` when ``in_q`` was empty.
    """
    if in_q.empty():
        return None

    url = in_q.get()
    print(url)
    nr_list = []
    try:
        data01 = qing(url)
        bs01 = BeautifulSoup(data01, 'lxml')
        # NOTE(review): ``l01`` is not assigned in this function or anywhere
        # else in the visible source. Unless it is defined elsewhere, this
        # loop raises NameError; presumably it should hold elements taken
        # from ``bs01`` whose markup is stripped before re-parsing. Confirm.
        for i01 in l01:
            data01 = data01.replace(str(i01), '')
        bs01 = BeautifulSoup(data01, 'lxml')

        cell_tags = re.compile('th|td')
        for tr in bs01.find_all('tr'):
            n002 = [x.text.replace('\n', ' ').strip() for x in tr.find_all(cell_tags)]
            if len(n002) > 1 and n002[0]:
                nr_list.extend(n002)
    except Exception as e:
        # Report the failure instead of letting the pool swallow it silently.
        print('处理错误：', url, e)
        nr_list = []
    finally:
        out_q.put(nr_list)
        in_q.task_done()
    return out_q

def get_link(path='D:/学习/爬取程序/网址601-1200.xlsx', sheet='表01'):
    """Open the workbook that lists the collected URLs and return one sheet.

    Args:
        path: Location of the .xlsx file. Defaults to the path the script
            was written for.
        sheet: Title of the worksheet to return.

    Returns:
        The openpyxl worksheet named ``sheet``.
    """
    data = openpyxl.load_workbook(path)
    table = data[sheet]
    # Report success only once both the file and the sheet have been found.
    print('读取成功')
    return table

# Multi-process driver: queue every URL, let a pool of workers crawl them,
# then write the collected rows to the output workbook.
if __name__ == '__main__':
    start = time.time()
    print(start)
    # A single manager process hosts both shared queues.
    manager = multiprocessing.Manager()
    queue = manager.Queue()
    result_queue = manager.Queue()
    print('构建成功')
    table = get_link()
    # Empty cells have the value None and are not URLs, so they are skipped.
    zhuan02 = [cell.value for row in table.rows for cell in row
               if cell.value is not None]
    for url in zhuan02:
        queue.put(url)
    print(zhuan02)
    print('queue 开始大小 %d' % queue.qsize())
    pool = multiprocessing.Pool(8)  # asynchronous (non-blocking) worker pool
    # ``run`` handles one URL per call, so submit one task per URL.
    for index in range(len(zhuan02)):
        pool.apply_async(
            run,
            args=(queue, result_queue),
            # Without a callback, exceptions raised in a worker are dropped.
            error_callback=lambda e: print('任务失败：', e),
        )
        print('第{}个。。。。'.format(index))
    pool.close()
    print('close')
    # join() returns once every submitted task has finished, so no further
    # waiting on the queues is needed afterwards.
    pool.join()
    print('join')
    isempty = queue.empty()  # True when every URL was taken by a worker
    print(isempty)
    end = time.time()
    print(end)
    # Collect only the results that were actually produced; asking for one
    # result per URL would block forever if a worker failed to report.
    lines = []
    for i in range(result_queue.qsize()):
        lines.append(result_queue.get())
        print('第{}个'.format(i))
    # Write all rows with one load/save instead of reopening the workbook
    # for every single row.
    yue01.xiubiao03(lines)
    print('总耗时：%s' % (end - start))
    print('queue 结束大小 %d' % queue.qsize())
    print('result_queue 结束大小 %d' % result_queue.qsize())