import requests
from fake_useragent import UserAgent
from lxml import etree
# URL management: tracks which URLs are still to be crawled and which are already done
class URLManager(object):
    def __init__(self):
        self.new_url = []   # URLs waiting to be crawled
        self.old_url = []   # URLs that have already been crawled

    def get_new_url(self):
        # take one pending URL and move it to the crawled list
        url = self.new_url.pop()
        self.old_url.append(url)
        return url

    def add_new_url(self, url):
        # only queue non-empty URLs that have not been seen before
        if url and url not in self.new_url and url not in self.old_url:
            self.new_url.append(url)

    def add_new_urls(self, urls):
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return self.get_new_url_size() > 0

    def get_new_url_size(self):
        return len(self.new_url)

    def get_old_url_size(self):
        return len(self.old_url)
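A quick sanity check of the dedupe behaviour, run in isolation; the URLs below are placeholders chosen for illustration, not pages the crawler necessarily visits:

# Minimal usage sketch for URLManager (placeholder URLs, assumed for illustration)
manager = URLManager()
manager.add_new_url('https://www.qiushibaike.com/text/page/1/')
manager.add_new_url('https://www.qiushibaike.com/text/page/1/')   # duplicate: ignored
print(manager.get_new_url_size())   # 1
url = manager.get_new_url()         # moves the URL into old_url
manager.add_new_url(url)            # already crawled: ignored
print(manager.get_new_url_size())   # 0
print(manager.get_old_url_size())   # 1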
# Download: fetches a page with a random User-Agent, returns the HTML text or None
class Downloader:
    def download(self, url):
        response = requests.get(url, headers={'User-Agent': UserAgent().random})
        if response.status_code == 200:
            response.encoding = 'utf-8'
            return response.text
        else:
            return None
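download returns None on a non-200 status, but a timeout or connection error would still raise an exception out of the crawl loop. A hedged variant that also swallows network errors might look like this (a sketch only; the 10-second timeout is an assumption, not something the original sets):

# Defensive download variant (sketch): adds a timeout and catches network errors
def download_safe(url):
    try:
        response = requests.get(
            url,
            headers={'User-Agent': UserAgent().random},
            timeout=10,   # assumed limit; the original request has no timeout
        )
    except requests.RequestException:
        return None
    if response.status_code == 200:
        response.encoding = 'utf-8'
        return response.text
    return None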
# Parse: extracts joke texts and pagination links from a downloaded page
class Parser:
    def parse(self, html):
        e = etree.HTML(html)
        data_s = self.parse_info(e)
        urls = self.parse_url(e)
        return data_s, urls

    def parse_info(self, e):
        # each joke sits in a <span> under <div class="content">
        spans = e.xpath("//div[@class='content']/span")
        data = []
        for span in spans:
            data.append(span.xpath('string(.)'))
        return data

    def parse_url(self, e):
        # turn the relative pagination hrefs into absolute URLs
        url_s = []
        base_url = 'https://www.qiushibaike.com{}'
        for url in e.xpath("//ul[@class='pagination']/li/a/@href"):
            url_s.append(base_url.format(url))
        return url_s
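To see what the two XPath expressions actually return, the parser can be fed a tiny hand-written snippet that mimics the page structure (the markup below is made up for illustration, not real site HTML):

# Feeding Parser a minimal fake page (assumed markup) to inspect its output
sample_html = '''
<div class="content"><span>first joke</span></div>
<div class="content"><span>second joke</span></div>
<ul class="pagination"><li><a href="/text/page/2/">next</a></li></ul>
'''
data, urls = Parser().parse(sample_html)
print(data)   # ['first joke', 'second joke']
print(urls)   # ['https://www.qiushibaike.com/text/page/2/']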
# Data output: appends the extracted texts to a local file
class DataOutput:
    def save(self, data_s):
        with open('duanzi.txt', 'a', encoding='utf-8') as f:
            for data in data_s:
                f.write(data)
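save writes the texts back-to-back with no separator, so consecutive entries run together in duanzi.txt. A small variation (a sketch, not the original behaviour) writes one stripped entry per line:

# Variant of DataOutput.save (sketch): one entry per line instead of raw concatenation
def save_lines(data_s, path='duanzi.txt'):
    with open(path, 'a', encoding='utf-8') as f:
        for data in data_s:
            f.write(data.strip() + '\n')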
# Scheduler: wires the components together and drives the crawl loop
class DiaoDu:
    def __init__(self):
        self.downloader = Downloader()
        self.url_manager = URLManager()
        self.parser = Parser()
        self.data_saver = DataOutput()

    def run(self, url):
        self.url_manager.add_new_url(url)
        while self.url_manager.has_new_url():
            url = self.url_manager.get_new_url()
            html = self.downloader.download(url)
            if html is None:    # skip pages that failed to download
                continue
            data, urls = self.parser.parse(html)
            self.data_saver.save(data)
            self.url_manager.add_new_urls(urls)
if __name__ == '__main__':
    diao_du = DiaoDu()
    diao_du.run('https://www.qiushibaike.com/text/')  # start page; the path is assumed, the original URL was cut off