对前篇实现的python爬虫进行模块化、组件化。分为过滤器去重模块、提取链接模块、页面处理模块、登录模块和主流程模块,目前非常粗糙,有待完善,代码如下:
主流程模块,程序入口:main.py
from collections import deque
import os

import requests

from extracter.myextracter import myextracter
from filter.setfilter import setfilter
from handler.myhandler import myhandler
# Initialize components: URL queue, dedup filter, link extractor, page handler.
queue = deque()
myfilter = setfilter()
handler = myhandler()
extracter = myextracter()

# Initial crawl state: seed URL, download directory, counters.
init_url = "https://mm.taobao.com/"
queue.append(init_url)
myfilter.add(init_url)
file_path = 'E:/mm/'
os.makedirs(file_path, exist_ok=True)  # make sure the save directory exists
count = 0  # number of pages fetched so far
i = 1      # sequence number used to name saved images
s = requests.session()

while queue:
    url = queue.popleft()
    print('已经抓取:' + str(count) + '个,正在抓取-->' + url)
    count += 1
    # Download the page; on any network/timeout error just skip this URL.
    try:
        r = s.get(url, timeout=2)
        r.encoding = 'UTF-8'
    except requests.RequestException:
        continue
    # Enqueue unseen in-site links. Pages use protocol-relative URLs
    # ("//host/..."), so prepend the scheme before queueing.
    # BUG FIX: the original checked contains(x) on the bare link but
    # stored "https:" + x, so the filter never matched and pages were
    # re-queued forever. Check and store the same (full) URL.
    for link in extracter.extract_urls(r.text, 'a', 'href'):
        if 'mm.taobao.com' in link:
            full_url = "https:" + link
            if not myfilter.contains(full_url):
                queue.append(full_url)
                myfilter.add(full_url)
    # Download and save every image referenced by the page.
    for src in extracter.extract_urls(r.text, 'img', 'src'):
        print("正在保存图片" + str(i) + "-->https:" + src)
        try:
            handler.save_file_binary(file_path + str(i) + ".jpg",
                                     s.get("https:" + src, timeout=2).content)
            i += 1
        except (requests.RequestException, OSError):
            # Best-effort: skip images that fail to download or save.
            continue
过滤器去重模块:setfilter.py:
# set过滤器,实现链接去重
class setfilter(object):
    """Set-backed filter used to deduplicate crawled links."""

    def __init__(self):
        # BUG FIX: the original declared `myfilter = set()` as a CLASS
        # attribute, so every setfilter instance shared one set. Make it
        # a per-instance attribute instead.
        self.myfilter = set()

    def add(self, link):
        """Record `link` as seen."""
        self.myfilter.add(link)

    def contains(self, link):
        """Return True if `link` has been recorded before."""
        return link in self.myfilter

    def clear(self):
        """Forget all recorded links."""
        self.myfilter.clear()
提取链接模块:myextracter.py:
#默认的提取页面链接方法
from bs4 import BeautifulSoup
class myextracter:
    """Default link extractor built on BeautifulSoup."""

    def extract_urls(self, page, tag, attr):
        """Return the set of `attr` values found on `tag` elements in `page`.

        page -- HTML text to parse
        tag  -- tag name to search for, e.g. 'a'
        attr -- attribute to collect, e.g. 'href'
        """
        soup = BeautifulSoup(page, "html.parser")
        urls = set()
        for element in soup.find_all(tag):
            # Tag.get() returns None when the attribute is absent,
            # replacing the original bare try/except around element[attr].
            value = element.get(attr)
            if value is not None:
                urls.add(value)
        return urls
页面处理模块,需用户自定义:myhandler.py:
#页面处理器,需要自定义
class myhandler:
    """Page handler: persists downloaded content to disk. Meant to be
    customized by the user."""

    def save_file_binary(self, file_path, data):
        """Write `data` (bytes) to `file_path`, overwriting any existing file."""
        with open(file_path, 'wb') as f:
            f.write(data)

    def save_file_str(self, file_path, data, encoding='utf-8'):
        """Write `data` (str) to `file_path` as text.

        ENCODING FIX: the original relied on the platform default encoding
        (e.g. GBK on a Chinese Windows box), which corrupts non-local text;
        default to UTF-8 while letting callers override via `encoding`.
        """
        with open(file_path, 'w', encoding=encoding) as f:
            f.write(data)
登录模块,用于保持session:login.py:
#自定义登录模块
class loginer:
    """Custom login helper: performs a form POST so the session picks up
    authentication cookies for subsequent requests."""

    # Browser-like request headers so the login endpoint does not reject
    # the client as a script. (The UA string is an IE11/Trident one.)
    headinfo = {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
    }

    def login(self, session, url, data):
        """POST the login form `data` to `url` using `session`, then hand
        the (now cookie-carrying) session back to the caller."""
        session.post(url, data, headers=self.headinfo)
        return session