1. Crawler components
URL management, HTML download, HTML parsing, and data storage, coordinated by a crawler manager.
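The manager in section 2 coordinates the other four components. As an illustrative sketch (the lowercase names below are placeholders matching the classes defined in the following sections), one crawl iteration looks like this:

# Illustrative pseudocode for one crawl iteration; the real loop is
# the ReptileManager.begin() method in section 2.
url = url_manager.get_url()                 # take a pending URL
html = html_download.download(url)          # fetch the page
urls, datas = html_parse.parse(url, html)   # extract new links and content
url_manager.put_urls(urls)                  # queue newly found links
data_save.store_data(datas)                 # persist the extracted content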
2. Crawler management
Drives the crawl loop by combining URL management, HTML download, HTML parsing, and data storage.
#coding:utf-8
import UrlManager, HtmlDownload, HtmlParse, DataSave

class ReptileManager():
    def __init__(self):
        self.url_manager = UrlManager.UrlManager()
        self.html_download = HtmlDownload.HtmlDownload()
        self.html_parse = HtmlParse.HtmlParse()
        self.data_save = DataSave.DataSave()

    def begin(self, root_url, count=50):
        # Seed the queue, then loop until there are no pending URLs
        # or `count` pages have been visited (has_old_urls() returns
        # the number of visited URLs).
        self.url_manager.put_url(root_url)
        i = 1
        while self.url_manager.has_new_urls() and self.url_manager.has_old_urls() < count:
            new_url = self.url_manager.get_url()
            html = self.html_download.download(new_url)
            print('[*]%d: parsing...%s' % (i, new_url))
            urls, datas = self.html_parse.parse(new_url, html)
            self.url_manager.put_urls(urls)
            self.data_save.store_data(datas)
            i += 1
        # Persist the URL sets so a later run can resume where this one stopped.
        self.url_manager.dump_local()
        print('mission success!')

if __name__ == '__main__':
    reptile = ReptileManager()
    root_url = r'https://www.biquge5200.cc/0_916/699498.html'
    print("root url:", root_url)
    reptile.begin(root_url, count=10)
3. URL management
Maintains a set of new URLs and a set of already-visited old URLs, and pickles both sets to local binary files so a crawl can be resumed.
import pickle

class UrlManager():
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled
        self.load_local()

    def get_url(self):
        # Pop an arbitrary pending URL and mark it as visited.
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def put_url(self, url):
        # Only accept URLs never seen before.
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def put_urls(self, urls):
        for url in urls:
            if url is None:
                continue
            self.put_url(url)

    def has_new_urls(self):
        # for url in self.new_urls: print("new url:", url)  # uncomment to trace the queue
        return len(self.new_urls) != 0

    def has_old_urls(self):
        # Despite the name, this returns the count of visited URLs.
        return len(self.old_urls)

    def dump_local(self):
        # The .txt files actually hold binary pickle data.
        with open('local_new_urls.txt', 'wb') as f:
            pickle.dump(self.new_urls, f)
        with open('local_old_urls.txt', 'wb') as f:
            pickle.dump(self.old_urls, f)

    def load_local(self):
        try:
            with open('local_new_urls.txt', 'rb') as f:
                self.new_urls = pickle.load(f)
            with open('local_old_urls.txt', 'rb') as f:
                self.old_urls = pickle.load(f)
        except (FileNotFoundError, EOFError):
            print('no local url cache found, starting fresh')
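A small standalone check of the dedup and persistence round trip, assuming UrlManager.py is in the same directory and no cached url files exist yet:

#coding:utf-8
import UrlManager

m = UrlManager.UrlManager()
m.put_url('https://www.biquge5200.cc/0_916/699498.html')
m.put_url('https://www.biquge5200.cc/0_916/699498.html')  # duplicate, ignored
print(m.has_new_urls())           # True
url = m.get_url()                 # moves the URL from new_urls to old_urls
print(m.has_old_urls())           # 1
m.dump_local()                    # writes both sets as pickle files
m2 = UrlManager.UrlManager()      # a fresh instance reloads them in __init__
print(m2.old_urls == m.old_urls)  # True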
4. HTML page download
#coding:utf-8
import requests

class HtmlDownload():
    def download(self, url):
        if url is None:
            return None
        user_agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
        header = {'User-Agent': user_agent}
        # Example proxy; replace it with a live one, or drop the
        # `proxies` argument to connect directly.
        proxies = {"http": "http://125.120.11.219:6666"}
        r = requests.get(url, headers=header, proxies=proxies)
        # r = requests.get(url)
        r.raise_for_status()   # raises on any non-2xx status
        r.encoding = 'utf-8'
        return r.text
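The download above blocks indefinitely on a slow server and raises on any network error. A hedged sketch of a more defensive variant (the function name, timeout, retry count, and backoff values here are arbitrary choices, not part of the original code):

#coding:utf-8
import time
import requests

def download_with_retry(url, retries=3, timeout=10):
    # Retry transient network failures with a short linear backoff.
    header = {'User-Agent': "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"}
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers=header, timeout=timeout)
            r.raise_for_status()
            r.encoding = 'utf-8'
            return r.text
        except requests.exceptions.RequestException as e:
            print('[!]attempt %d failed: %s' % (attempt, e))
            time.sleep(attempt)  # back off 1s, 2s, ...
    return None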
5. HTML page parsing
Extracts qualifying new URLs and the target data from each downloaded page.
#coding:utf-8
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

class HtmlParse():
    def parse(self, page_url, r_text):
        if page_url is None or r_text is None:
            print('warning: no url or page content')
            return set(), []
        soup = BeautifulSoup(r_text, 'html.parser')
        new_urls = self.extract_urls(page_url, soup)
        data = self.extract_datas(soup)
        return new_urls, data

    def extract_urls(self, page_url, soup):
        new_urls = set()
        # Keep only links into this book's chapter directory.
        links = soup.find_all(name='a', attrs={'href': re.compile(r'^https://www.biquge5200.cc/0_916/.+')})
        if len(links) == 0:
            print('[*]parse failed!')
        for link in links:
            # The hrefs on this site are already absolute;
            # urljoin(page_url, link['href']) would be needed for relative links.
            new_urls.add(link['href'])
        return new_urls

    def extract_datas(self, soup):
        # The return type depends on what is being extracted;
        # here every <p> tag (the chapter text) is collected.
        datas = soup.find_all('p')
        return datas
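A quick self-contained check of the parser against a fabricated page snippet (the HTML below is made up for illustration; real pages differ):

#coding:utf-8
import HtmlParse

sample_html = '''
<a href="https://www.biquge5200.cc/0_916/699499.html">next chapter</a>
<a href="https://www.biquge5200.cc/other/1.html">unrelated link</a>
<p>First paragraph of chapter text.</p>
<p>Second paragraph.</p>
'''
parser = HtmlParse.HtmlParse()
urls, datas = parser.parse('https://www.biquge5200.cc/0_916/699498.html', sample_html)
print(urls)                           # only the /0_916/ link survives the regex filter
print([d.get_text() for d in datas])  # the two <p> texts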
6. Data storage
#coding:utf-8
class DataSave():
    def __init__(self):
        pass

    def store_data(self, datas):
        # Append each parsed tag's text to book.txt.
        with open('book.txt', 'a', encoding='utf-8') as f:
            for data in datas:
                f.write(data.get_text())
                f.write('\n')
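The saver expects BeautifulSoup tags (anything with a get_text() method); a minimal standalone check:

#coding:utf-8
from bs4 import BeautifulSoup
import DataSave

soup = BeautifulSoup('<p>hello</p><p>world</p>', 'html.parser')
saver = DataSave.DataSave()
saver.store_data(soup.find_all('p'))  # appends "hello\nworld\n" to book.txt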
7. Testing
Run ReptileManager.py.
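Based on the print statements in ReptileManager, a successful run should produce output of roughly this shape (the URLs and the number of iterations will vary):

root url: https://www.biquge5200.cc/0_916/699498.html
[*]1: parsing...https://www.biquge5200.cc/0_916/699498.html
[*]2: parsing...https://www.biquge5200.cc/0_916/...
...
mission success!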