from fake_useragent import UserAgent
import requests
from lxml import etree


# Send the request
class Downloader():
    def do_download(self, url):
        print(url)
        headers = {'User-Agent': UserAgent().random}
        resp = requests.get(url, headers=headers)
        if resp.status_code == 200:
            resp.encoding = 'utf-8'
            return resp.text


# Parse the data
class Parser():
    def do_parse(self, html):
        e = etree.HTML(html)
        # Iterate over the <dd> elements themselves (not their text nodes),
        # otherwise string(.) cannot be called on the results
        contents = [dd.xpath('string(.)') for dd in e.xpath('//div[4]/div[1]/div[1]/dl/dd')]
        # .format() needs the url argument, otherwise it raises IndexError
        urls = ['https://www.biedoul.com{}'.format(url) for url in e.xpath('//div[4]/div[1]/div[2]/a/@href')]
        return contents, urls


# Save the data
class DataOutPut():
    def do_save(self, datas):
        with open('duanzi.txt', 'a', encoding='utf-8') as f:
            for data in datas:
                f.write(data + '\n')


# URL manager
class URLManager():
    def __init__(self):
        self.new_url = set()
        self.old_url = set()

    # Add one URL; skip URLs that have already been crawled
    def add_new_url(self, url):
        if url is not None and url != '' and url not in self.old_url:
            self.new_url.add(url)

    # Add multiple URLs
    def add_new_urls(self, urls):
        for url in urls:
            self.add_new_url(url)

    # Get one URL and record it as crawled
    def get_new_url(self):
        url = self.new_url.pop()
        self.old_url.add(url)
        return url

    # How many URLs are still waiting to be crawled
    def get_new_url_size(self):
        return len(self.new_url)

    # Whether there are still URLs to crawl
    def have_new_url(self):
        return self.get_new_url_size() > 0


# Scheduler
class Scheduler:
    def __init__(self):
        self.downloader = Downloader()
        self.parser = Parser()
        self.data_out_put = DataOutPut()
        self.url_manager = URLManager()

    def start(self, url):
        self.url_manager.add_new_url(url)
        while self.url_manager.have_new_url():
            url = self.url_manager.get_new_url()
            html = self.downloader.do_download(url)
            datas, urls = self.parser.do_parse(html)
            self.data_out_put.do_save(datas)
            self.url_manager.add_new_urls(urls)


if __name__ == '__main__':
    scheduler = Scheduler()
    url = 'https://www.biedoul.com/t/5YaF5ra15q615a2Q_1.html'  # URL to start crawling from
    scheduler.start(url)
Could any expert take a look and help me figure out how to get this working? (A minimal single-page check I've been using to narrow things down is sketched below.)
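The sketch below exercises Downloader and Parser on the start page only, without the Scheduler loop, so download or XPath problems show up in isolation. It assumes the classes above are already defined in the same script, that the site is reachable, and that the XPath expressions still match the current page layout.

# Sketch: run one download + parse cycle and print a small sample,
# so XPath or request problems are visible before the full crawl.
start_url = 'https://www.biedoul.com/t/5YaF5ra15q615a2Q_1.html'
html = Downloader().do_download(start_url)
if html is None:
    print('download failed (non-200 response)')
else:
    contents, urls = Parser().do_parse(html)
    print('parsed {} entries and {} next-page urls'.format(len(contents), len(urls)))
    for text in contents[:3]:   # eyeball a few results to confirm the XPath
        print(text.strip()[:80])

If the counts printed here are zero, the problem is in the XPath expressions rather than in the scheduler or the URL manager.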