A crawler project (scraping girl pics)
The image URLs are obfuscated: here they are recovered with base64 decoding, and the page's own JS carries a token (used to unlock the image URL). A quick sketch of the decoding idea comes first, then the full source.
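The sketch below is minimal and self-contained; the hash is built on the spot from a made-up URL so the round trip is guaranteed (real img-hash values come from the page, minus their '=' padding):

import base64

plain = '//wx3.sinaimg.cn/mw600/example.jpg'  # made-up image URL, protocol-relative like the real ones
img_hash = base64.b64encode(plain.encode('utf8')).decode('ascii').rstrip('=')  # roughly what the page serves
img_hash += '=' * (-len(img_hash) % 4)  # restore the stripped padding
print('http:' + base64.b64decode(img_hash).decode('utf8'))  # http://wx3.sinaimg.cn/mw600/example.jpg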
Enough talk, here's the source:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import hashlib
import base64
from bs4 import BeautifulSoup
import requests
import re
import os
import queue
import threading
import math
from multiprocessing import Pool
import sys
sys.stderr = None  # silence every error message (quick and dirty; see the closing note)
'''
URL decoding
'''
def parse(imgHash, constant):
    # constant (extracted from the site's JS) is unused in this version:
    # the current img-hash values are plain base64
    return decode_base64(imgHash).decode('utf8')

def md5(src):
    m = hashlib.md5()
    m.update(src.encode("utf8"))
    return m.hexdigest()

def decode_base64(data):
    missing_padding = len(data) % 4
    if missing_padding:  # only pad when the length is not already a multiple of 4
        data += '=' * (4 - missing_padding)
    return base64.b64decode(data)
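# Worked padding example: a 46-character hash has 46 % 4 == 2, so two '='
# are appended, giving a length of 48 that base64.b64decode accepts.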
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}
'''
Page-scraping class
'''
class Spider(threading.Thread):
    def __init__(self, pages, proxies, url_manager):
        threading.Thread.__init__(self)
        self.pages = pages
        self.proxies = proxies
        self.url_manager = url_manager

    def get_Page(self, page, proxies, url_manager):
        bs_page = BeautifulSoup(page, "lxml")
        '''
        Locate the JS file URL so the constant can be extracted from it
        '''
        try:
            model = re.findall(r'.*<script\ssrc=\"\/\/(cdn.jandan.net\/static\/min.*?)\"><\/script>.*', page)
            jsfile_url = "http://" + model[len(model) - 1]  # the page may contain two matches; take the last one
        except Exception as e:
            print(e)
            return  # without a JS file URL there is nothing to decode
        jsfile = requests.get(jsfile_url, headers=headers, proxies=proxies, timeout=3).text
        constant = re.search(r'.*remove\(\);var\sc=\w+\(e,\"(\w+)\".*', jsfile).group(1)
        '''
        Pass the constant and each img-hash to parse() to recover the image URL
        '''
        for item in bs_page.select('.img-hash'):
            img_url = 'http:' + parse(item.text, constant)
            url_manager.addNewUrl(img_url)

    def run(self):
        for page in self.pages:
            self.get_Page(page, self.proxies, self.url_manager)
'''
Program entry
'''
def main(amount):
    url_manager = UrlManager()
    proxies = {'http': ''}  # IP-proxy support not added yet; the program runs fine without it
    current_url = 'http://jandan.net/ooxx'  # URL of the current page
    '''
    Collect the page sources, then scrape them with multiple threads
    '''
    pages = []  # sources of every page to scrape
    try:
        for i in range(amount):
            current_page = requests.get(current_url, headers=headers).text  # source of the current page
            pages.append(current_page)
            current_url = 'http:' + re.search(r'.*Older\sComments\"\shref=\"(.*?)\"\sclass.*', current_page).group(1)  # extract the next page's URL
    except Exception as e:
        pass  # on any failure, stop paging and work with what was fetched
    page_threads = []
    t_amount = 10 if len(pages) > 10 else len(pages)  # number of page-scraping threads
    chunk = math.ceil(len(pages) / t_amount) if t_amount else 0  # ceil so no trailing page is dropped
    for i in range(t_amount):
        t = Spider(pages[chunk * i:chunk * (i + 1)], proxies, url_manager)
        page_threads.append(t)
    for t in page_threads:
        t.start()
    for t in page_threads:
        t.join()
    img_threads = []
    for i in range(10):  # a fixed pool of 10 image-download threads
        t = Download(url_manager)
        img_threads.append(t)
    for t in img_threads:
        t.start()
    for t in img_threads:
        t.join()
L = threading.Lock()  # serializes the count-files-then-write step in download_Img
'''
Image-download class
'''
class Download(threading.Thread):
    def __init__(self, url_manager):
        threading.Thread.__init__(self)
        self.url_manager = url_manager
        self.pic_headers = headers.copy()  # copy, so the shared headers dict is not mutated
        self.pic_headers['Host'] = 'wx3.sinaimg.cn'

    def download_Img(self, url):
        isGif = re.match(r'(.*\.sinaimg\.cn\/)(\w+)(\/.+\.gif)', url)
        if isGif:
            url = isGif.group(1) + 'large' + isGif.group(3)  # swap in the full-size GIF path
        extensionName = re.match(r'.*(\.\w+)', url).group(1)  # image file extension
        L.acquire()
        if not os.path.exists('img'):
            os.mkdir('img')
        with open('img/' + str(len(os.listdir('./img'))) + extensionName, 'wb') as f:
            f.write(requests.get(url, headers=self.pic_headers).content)
        L.release()

    def run(self):
        while not self.url_manager.isEmpty():
            imgUrl = self.url_manager.getNewUrl()
            self.download_Img(imgUrl)
            self.url_manager.addOldUrl(imgUrl)
'''
URL store: hands out new URLs and records finished ones
'''
class UrlManager:
    def __init__(self):
        self.url_used = []
        self.url_target = queue.Queue()
        if os.path.exists('url.txt'):
            with open('url.txt', 'r') as f:
                for eachline in f.readlines():
                    self.url_used.append(eachline.strip())
        else:
            open("url.txt", 'w').close()  # create the record file on first run

    def getNewUrl(self):
        return self.url_target.get()

    def isEmpty(self):
        return self.url_target.empty()

    def addNewUrl(self, newUrl):
        if newUrl not in self.url_used:
            self.url_target.put(newUrl)

    def addOldUrl(self, oldUrl):
        self.url_used.append(oldUrl)
        with open('url.txt', 'a') as f:
            f.write(oldUrl + '\n')
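# Design note: url_used is reloaded from url.txt on every start, so images that a
# previous run already fetched are skipped; addOldUrl appends each finished URL to that file.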
if __name__ == '__main__':
    num_list = [i for i in range(48)]
    res_l = []
    p = Pool()
    for i in num_list:
        res = p.apply_async(main, args=(int(i),))  # worker i scrapes the first i pages
        res_l.append(res)
    p.close()  # no more tasks; let the workers drain and exit
    p.join()
    for i, k in enumerate(res_l):
        k.get()  # re-raises any exception the worker hit
        print('Downloaded girls (batch %s)' % i)
Built on multithreading plus multiprocessing (and with all errors silenced via sys.stderr, so there is plenty of room to extend the error handling on top of this). Thanks!
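If you would rather see errors than silence them, here is a minimal sketch of an alternative entry point: drop the sys.stderr = None line and drive the same main() with a context-managed Pool (the batch count 48 just mirrors the value used above):

from multiprocessing import Pool

if __name__ == '__main__':
    with Pool() as p:  # the context manager calls terminate() on exit
        results = [p.apply_async(main, args=(i,)) for i in range(48)]
        for i, r in enumerate(results):
            r.get()  # blocks until batch i finishes; re-raises worker exceptions
            print('batch %d done' % i)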