Read good code the way you read the literature
I have been learning web scraping for a while, but progress has been slow. A big reason, I think, is that I only watch videos and copy the code along with them, so after a short while I forget it all. As the saying goes, read a book a hundred times and its meaning reveals itself. With this exercise I want to go back through the code I have studied, organize it, and take my understanding up a level.
Scraping Baidu Tieba
# coding=utf-8
import requests
class TiebaSpider:
def __init__(self, tieba_name):
self.tieba_name = tieba_name
self.url_temp = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
self.headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
def get_url_list(self):  # 1. build the list of URLs
# url_list = []
# for i in range(1000):
# url_list.append(self.url_temp.format(i*50))
# return url_list
return [self.url_temp.format(i * 50) for i in range(1000)]
def parse_url(self, url):  # send the request and get the response
print(url)
response = requests.get(url, headers=self.headers)
return response.content.decode()
def save_html(self, html_str, page_num):  # save the HTML string
file_path = "{}—第{}页.html".format(self.tieba_name, page_num)
with open(file_path, "w", encoding="utf-8") as f: # "李毅—第4页.html"
f.write(html_str)
def run(self):  # main logic
# 1. build the URL list
url_list = self.get_url_list()
# 2. iterate: send each request and get the response
for url in url_list:
html_str = self.parse_url(url)
# 3. save
page_num = url_list.index(url) + 1  # page number
self.save_html(html_str, page_num)
if __name__ == '__main__':
tieba_spider = TiebaSpider("lol")
tieba_spider.run()
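One small refinement worth noting: run() recovers the page number with url_list.index(url), which rescans the list on every iteration. A minimal sketch of the same loop using enumerate() instead (my own variation, not the original code):

def run(self):
    # enumerate() yields (page_num, url) pairs directly, so no index() lookup is needed
    for page_num, url in enumerate(self.get_url_list(), start=1):
        html_str = self.parse_url(url)
        self.save_html(html_str, page_num)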
Scraping Baidu Translate
import requests
import json
import sys
class BaiduFanyi:
def __init__(self,trans_str):
self.trans_str = trans_str
self.lang_detect_url = "http://fanyi.baidu.com/langdetect"
self.trans_url = "http://fanyi.baidu.com/basetrans"
self.headers = {"User-Agent":"Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36"}
def parse_url(self,url,data): # send a POST request and get the response
response = requests.post(url,data=data,headers=self.headers)
return json.loads(response.content.decode())
def get_ret(self,dict_response): # extract the translation result
ret = dict_response["trans"][0]["dst"]
print("result is :",ret)
def run(self): # main logic
# 1. detect the language
# 1.1 prepare the POST url and post data
lang_detect_data = {"query":self.trans_str}
# 1.2 send the POST request and get the response
lang = self.parse_url(self.lang_detect_url,lang_detect_data)["lan"]
# 1.3 extract the language type
# 2. prepare the translation POST data
trans_data = {"query":self.trans_str,"from":"zh","to":"en"} if lang == "zh" else \
{"query":self.trans_str,"from":"en","to":"zh"}
# 3. send the request and get the response
dict_response = self.parse_url(self.trans_url,trans_data)
# 4. extract the translation result
self.get_ret(dict_response)
if __name__ == '__main__':
trans_str= sys.argv[1]
baidu_fanyi = BaiduFanyi(trans_str)
baidu_fanyi.run()
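The text to translate comes from the command line via sys.argv[1], e.g. python baidu_fanyi.py 你好 (assuming the file is saved as baidu_fanyi.py). If no argument is given the script raises an IndexError; a minimal guard for that case could look like this (my addition, not in the original):

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("usage: python baidu_fanyi.py <text to translate>")
        sys.exit(1)
    BaiduFanyi(sys.argv[1]).run()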
Maoyan movie rankings
import json
import re
import time
import requests
from requests.exceptions import RequestException
def get_one_page(url):
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
response = requests.get(url,headers=headers)
if response.status_code == 200:
return response.text
return None
except RequestException:
return None
def parse_one_page(html):
pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
+ r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
+ r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
items = re.findall(pattern, html)
for item in items:
yield {
'index': item[0],
'image': item[1],
'title': item[2],
'actor': item[3].strip()[3:],
'time': item[4].strip()[5:],
'score': item[5] + item[6]
}
# json.dumps() serializes the dict; ensure_ascii=False keeps the output as readable Chinese instead of \u Unicode escapes
def write_to_file(content):
with open('result.txt', 'a', encoding='utf-8') as f:
f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(offset):
url = 'http://maoyan.com/board/4?offset=' + str(offset)
html = get_one_page(url)
for item in parse_one_page(html):
print(item)
write_to_file(item)
if __name__ == '__main__':
for i in range(10):
main(offset=i*10)
time.sleep(1)
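The ensure_ascii point in the comment above is easy to verify on its own; a quick standalone illustration (sample data, not scraper output):

import json

item = {'title': '霸王别姬', 'score': '9.6'}
print(json.dumps(item))                      # {"title": "\u9738\u738b\u522b\u59ec", "score": "9.6"}
print(json.dumps(item, ensure_ascii=False))  # {"title": "霸王别姬", "score": "9.6"}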
Optimized version
import json
from lxml import etree
import time
import requests
from requests.exceptions import RequestException
def get_one_page(url):
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
response = requests.get(url, headers=headers)
if response.status_code == 200:
return response.text
return None
except RequestException:
return None
def parse_one_page(html):
resp = etree.HTML(html)
dds = resp.xpath('//dl[@class="board-wrapper"]/dd')
items = []
for dd in dds:
index = dd.xpath('./i/text()')[0]
image = dd.xpath('.//img[@class="board-img"]/@data-src | .//img[@class="board-img"]/@src')[0]
title = dd.xpath('.//p[@class="name"]//text()')[0]
actor = dd.xpath('.//p[@class="star"]//text()')[0].strip()[3:]
release_time = dd.xpath('.//p[@class="releasetime"]/text()')[0][5:]
score = ''.join(dd.xpath('.//p[@class="score"]//text()'))
item = [index, image, title, actor, release_time, score]
items.append(item)
return items
# pattern = re.compile('<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
# + '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
# + '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
# items = re.findall(pattern, html)
# for item in items:
# yield {
# 'index': item[0],
# 'image': item[1],
# 'title': item[2],
# 'actor': item[3].strip()[3:],
# 'time': item[4].strip()[5:],
# 'score': item[5] + item[6]
# }
# json.dumps() serializes the list; ensure_ascii=False keeps the output as readable Chinese instead of \u Unicode escapes
def write_to_file(content):
with open('result.txt', 'a', encoding='utf-8') as f:
f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main():
url_list = ['http://maoyan.com/board/4?offset={}'.format(i * 10) for i in range(10)]
for url in url_list:
html = get_one_page(url)
items = parse_one_page(html)
print(items)
write_to_file(items)
# break
if __name__ == '__main__':
main()
# time.sleep(1)
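A detail worth remembering from this version: inside the dd loop every XPath has to start with './' so it stays relative to the current node; an expression starting with '//' searches the whole document again. A tiny standalone lxml illustration (my own example):

from lxml import etree

html = etree.HTML('<dl><dd><i>1</i></dd><dd><i>2</i></dd></dl>')
for dd in html.xpath('//dd'):
    print(dd.xpath('.//i/text()'))  # relative: ['1'], then ['2']
    print(dd.xpath('//i/text()'))   # absolute: ['1', '2'] both times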
Toutiao street-photography images
import requests
from urllib.parse import urlencode
from requests import codes
import os
from hashlib import md5
from multiprocessing.pool import Pool
def get_page(offset):
params = {
'offset': offset,
'format': 'json',
'keyword': '街拍',
'autoload': 'true',
'count': '20',
'cur_tab': '1',
'from': 'search_tab'
}
base_url = 'https://www.toutiao.com/search_content/?'
url = base_url + urlencode(params)
try:
resp = requests.get(url)
if codes.ok == resp.status_code:
return resp.json()
except requests.ConnectionError:
return None
def get_images(json):
if json.get('data'):
data = json.get('data')
for item in data:
if item.get('cell_type') is not None:
continue
title = item.get('title')
images = item.get('image_list')
for image in images:
yield {
'image': 'https:' + image.get('url'),
'title': title
}
def save_image(item):
img_path = 'img' + os.path.sep + item.get('title')
# makedirs creates the directory tree recursively
if not os.path.exists(img_path):
os.makedirs(img_path)
try:
url = item.get('image').replace('list', 'large')
resp = requests.get(url)
if codes.ok == resp.status_code:
# the md5 of the content acts as a fingerprint, so identical images get the same name; file_suffix is the file extension
file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format(
file_name=md5(resp.content).hexdigest(),
file_suffix='jpg')
if not os.path.exists(file_path):
with open(file_path, 'wb') as f:
f.write(resp.content)
print('Downloaded image path is %s' % file_path)
else:
print('Already Downloaded', file_path)
except requests.ConnectionError:
print('Failed to Save Image,item %s' % item)
def main(offset):
json = get_page(offset)
for item in get_images(json):
print(item)
save_image(item)
GROUP_START = 0
GROUP_END = 7
if __name__ == '__main__':
pool = Pool()
groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
pool.map(main, groups)
pool.close()
pool.join()
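The md5-as-filename trick doubles as deduplication: responses with identical bytes hash to the same name, so the os.path.exists() check skips the repeat download. A quick standalone illustration (not part of the scraper):

from hashlib import md5

a = md5(b'same image bytes').hexdigest()
b = md5(b'same image bytes').hexdigest()
c = md5(b'different bytes').hexdigest()
print(a == b)  # True  -> the second copy would be skipped
print(a == c)  # False -> different content gets its own file name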
Saving Zhihu Explore posts to a txt file
import requests
from pyquery import PyQuery as pq
url = 'https://www.zhihu.com/explore'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
}
html = requests.get(url, headers=headers).text
doc = pq(html)
items = doc('.explore-tab .feed-item').items()
for item in items:
question = item.find('h2').text()
print(question)
author = item.find('.author-link-line').text()
print(author)
answer = pq(item.find('.content').html()).text()
print(answer)
with open('explore.txt', 'a', encoding='utf-8') as f:
f.write('\n'.join([question, author, answer]))
f.write('\n' + '=' * 50 + '\n')
# file = open('explore.txt', 'a', encoding='utf-8')
# file.write('\n'.join([question, author, answer]))
# file.write('\n' + '='*50 + '\n')
# file.close()
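The least obvious line is pq(item.find('.content').html()).text(): .html() returns the answer's inner HTML, and wrapping that in a new PyQuery object before calling .text() strips the tags. A minimal illustration (my own snippet):

from pyquery import PyQuery as pq

doc = pq('<div class="content"><p>第一段</p><p>第二段</p></div>')
inner = doc('.content').html()  # '<p>第一段</p><p>第二段</p>'
print(pq(inner).text())         # plain text of both paragraphs, tags removed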
Scraping Cui Qingcai's Weibo
from urllib.parse import urlencode
import requests
from pymongo import MongoClient
from pyquery import PyQuery as pq
base_url = 'https://m.weibo.cn/api/container/getIndex?'
headers = {
'Host': 'm.weibo.cn',
'Referer': 'https://m.weibo.cn/u/2830678474',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/68.0.3440.106 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
}
client = MongoClient()
db = client.weibo
collection = db.weibo
max_page = 10
def get_page(page):
params={
'type': 'uid',
'value': '2830678474',
'containerid': '1076032830678474',
'page': page
}
url = base_url + urlencode(params)
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return response.json(), page
except requests.ConnectionError as e:
print('Error', e.args)
def parse_page(json, page:int):
if json:
items = json.get('data').get('cards')
for index, item in enumerate(items):
if page==1 and index==1:
continue
else:
item = item.get('mblog')
weibo = {}
weibo['id'] = item.get('id')
# the value under the 'text' key is HTML, so parse it with pyquery
weibo['text'] = pq(item.get('text')).text()
weibo['attitudes'] = item.get('attitudes_count')
weibo['comments'] = item.get('comments_count')
weibo['reposts'] = item.get('reposts_count')
yield weibo
def save_to_mongo(result):
if collection.insert_one(result):
print('Save to Mongo')
if __name__ == "__main__":
for page in range(1, max_page+1):
json = get_page(page)
results = parse_page(*json) if json else []  # skip pages whose request failed
for result in results:
print(result)
save_to_mongo(result)
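save_to_mongo() inserts blindly, so rerunning the script stores the same weibos twice. If deduplication matters, one option (my own variation, using pymongo's update_one with upsert=True) is:

def save_to_mongo(result):
    # update the document with this weibo id, or insert it if it does not exist yet
    collection.update_one({'id': result['id']}, {'$set': result}, upsert=True)
    print('Saved to Mongo', result['id'])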
Scraping Taobao food listings with Selenium
import re
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from config import *
import pymongo
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
# to use headless Chrome as the browser instead:
# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless')
# browser = webdriver.Chrome(chrome_options=chrome_options)
# use PhantomJS as the browser
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
wait = WebDriverWait(browser, 10)
# it is usually necessary to set the browser window size
browser.set_window_size(1400, 900)
def search():
print('正在搜索...')
try:
browser.get('https://www.taobao.com')
input = wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
submit = wait.until(
EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
input.send_keys(KEYWORD)
submit.click()
total = wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
get_products()
return total.text
except TimeoutException:
return search()
def next_page(page_number):
print('正在翻页', page_number)
try:
input = wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
)
submit = wait.until(EC.element_to_be_clickable(
(By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
input.clear()
input.send_keys(page_number)
submit.click()
wait.until(EC.text_to_be_present_in_element(
(By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)
))
get_products()
except TimeoutException:
next_page(page_number)
def get_products():
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
html = browser.page_source
doc = pq(html)
items = doc('#mainsrp-itemlist .items .item').items()
for item in items:
product = {
'image': item.find('.pic .img').attr('data-src'),
'price': item.find('.price').text(),
'deal': item.find('.deal-cnt').text()[:-3],
'title': item.find('.title').text(),
'shop': item.find('.shop').text(),
'location': item.find('.location').text()
}
save_to_mongo(product)
def save_to_mongo(result):
try:
if db[MONGO_TABLE].insert(result):
print('存储到MONGODB成功', result)
except Exception:
print('存储到MONGODB失败', result)
def main():
"""the try/finally block guarantees the browser is closed at the end; catching Exception swallows any error in between"""
try:
total = search()
total = int(re.compile(r'(\d+)').search(total).group(1))
for i in range(2, total + 1):
next_page(i)
except Exception:
print('出错啦')
finally:
browser.close()
if __name__ == "__main__":
main()
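The script imports everything from a config module that is not shown here. A plausible sketch of what it has to define, inferred from the names used above (the concrete values are assumptions):

# config.py (assumed contents)
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_TABLE = 'product'
# PhantomJS service args: skip image loading and enable the disk cache to speed crawling up
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
KEYWORD = '美食'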
Building a proxy pool
run.py
from proxypool.api import app
from proxypool.schedule import Schedule
def main():
s = Schedule()
s.run()
app.run()
if __name__ == '__main__':
main()
api.py
from flask import Flask, g
from .db import RedisClient
__all__ = ['app']
app = Flask(__name__)
def get_conn():
"""
Open a new redis connection if there is none yet for the current application context
"""
if not hasattr(g, 'redis_client'):
g.redis_client = RedisClient()
return g.redis_client
@app.route('/')
def index():
return '<h2>Welcome to Proxy Pool System</h2>'
@app.route('/get')
def get_proxy():
"""
Get a proxy
"""
conn = get_conn()
return conn.pop()
@app.route('/count')
def get_counts():
"""
Get the count of proxies
"""
conn = get_conn()
return str(conn.queue_len)
if __name__ == '__main__':
app.run()
schedule.py
import aiohttp
import asyncio
from proxypool.db import RedisClient
from proxypool.error import ResourceDepletionError
from proxypool.getter import FreeProxyGetter
from proxypool.setting import *
import time
from multiprocessing import Process
try:
from aiohttp.errors import ProxyConnectionError, ServerDisconnectedError, ClientResponseError
except ImportError:
from aiohttp import ClientProxyConnectionError as ProxyConnectionError, ServerDisconnectedError, ClientResponseError, ClientConnectionError
class ValidityTester(object):
test_api = TEST_API
def __init__(self):
self._raw_proxies = None
self._usable_proxies = []
def set_raw_proxies(self, proxies):
self._raw_proxies = proxies
self._conn = RedisClient()
async def test_single_proxy(self, proxy):
"""
test one proxy; if it is valid, put it into the pool
"""
try:
async with aiohttp.ClientSession() as session:
try:
if isinstance(proxy, bytes):
# proxies read from redis are bytes, so decode them to str first
proxy = proxy.decode('utf-8')
real_proxy = 'http://' + proxy
print('Testing', proxy)
async with session.get(self.test_api, proxy=real_proxy, timeout=get_proxy_timeout) as response:
if response.status == 200:
self._conn.put(proxy)
print('Valid proxy', proxy)
except (ProxyConnectionError, TimeoutError, ValueError):
print('Invalid proxy', proxy)
except (aiohttp.ServerDisconnectedError, aiohttp.ClientResponseError, aiohttp.ClientConnectorError) as s:
print(s)
pass
def test(self):
"""
aio test all proxies
"""
print('ValidityTester is working')
try:
loop = asyncio.get_event_loop()
tasks = [self.test_single_proxy(proxy) for proxy in self._raw_proxies]
loop.run_until_complete(asyncio.wait(tasks))
except ValueError:
print('Async Error')
class PoolAdder(object):
"""
add proxy to pool
"""
def __init__(self, threshold):
self._threshold = threshold
self._conn = RedisClient()
self._tester = ValidityTester()
self._crawler = FreeProxyGetter()
def is_over_threshold(self):
"""
check whether the pool size has reached the threshold
"""
if self._conn.queue_len >= self._threshold:
return True
else:
return False
def add_to_queue(self):
print('PoolAdder is working')
proxy_count = 0
while not self.is_over_threshold():
for callback_label in range(self._crawler.__CrawlFuncCount__):
callback = self._crawler.__CrawlFunc__[callback_label]
raw_proxies = self._crawler.get_raw_proxies(callback)
# test crawled proxies
self._tester.set_raw_proxies(raw_proxies)
self._tester.test()
proxy_count += len(raw_proxies)
if self.is_over_threshold():
print('IP is enough, waiting to be used')
break
if proxy_count ==0:
raise ResourceDepletionError
class Schedule(object):
@staticmethod
def valid_proxy(cycle=VALID_CHECK_CYCLE):
"""
Take half of the proxies currently in redis and re-test them
"""
conn = RedisClient()
tester = ValidityTester()
while True:
print('Refreshing ip')
count = int(0.5 * conn.queue_len)
if count == 0:
print('Waiting for adding')
time.sleep(cycle)
continue
raw_proxies = conn.get(count)
tester.set_raw_proxies(raw_proxies)
tester.test()
time.sleep(cycle)
@staticmethod
def check_pool(lower_threshold=POOL_LOWER_THRESHOLD,
upper_threshold=POOL_UPPER_THRESHOLD,
cycle=POOL_LEN_CHECK_CYCLE):
"""
If the number of proxies drops below lower_threshold, add more proxies to the pool
"""
conn = RedisClient()
adder = PoolAdder(upper_threshold)
while True:
if conn.queue_len < lower_threshold:
adder.add_to_queue()
time.sleep(cycle)
def run(self):
print('IP processing running')
valid_process = Process(target=Schedule.valid_proxy)
check_process = Process(target=Schedule.check_pool)
valid_process.start()
check_process.start()
getter.py
from .utils import get_page
from pyquery import PyQuery as pq
import re
class ProxyMetaclass(type):
"""
Metaclass: adds the attributes __CrawlFunc__ and __CrawlFuncCount__ to FreeProxyGetter,
holding the list of crawl-function names and their count respectively
"""
def __new__(cls, name, bases, attrs):
count = 0
attrs['__CrawlFunc__'] = []
for k,v in attrs.items():
if 'crawl_' in k:
attrs['__CrawlFunc__'].append(k)
count += 1
attrs['__CrawlFuncCount__'] = count
return type.__new__(cls, name, bases, attrs)
class FreeProxyGetter(object, metaclass=ProxyMetaclass):
def get_raw_proxies(self, callback):
proxies = []
print('Callback', callback)
for proxy in eval("self.{}()".format(callback)):
print('Getting', proxy, 'from', callback)
proxies.append(proxy)
return proxies
def crawl_kuaidaili(self):
for page in range(1,4):
# free high-anonymity proxies inside China
start_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(page)
html = get_page(start_url)
ip_address = re.compile(
'<td data-title="IP">(.*)</td>\s*<td data-title="PORT">(\w+)</td>'
)
re_ip_address = ip_address.findall(str(html))
for address, port in re_ip_address:
result = address + ":" + port
yield result.replace(' ', '')
def crawl_xicidaili(self):
for page in range(1,4):
start_url = 'http://www.xicidaili.com/nn/{}'.format(page)
html = get_page(start_url)
ip_address = re.compile(
'<td class="country"><img src="http://fs.xicidaili.com/images/flag/cn.png" '
'alt="Cn"></td>\s*<td>(.*?)</td>\s*<td>(.*?)</td>'
)
# \s* matches the whitespace (including the line break) between the two <td> cells
re_ip_address = ip_address.findall(str(html))
for address, port in re_ip_address:
result = address + ':' + port
yield result.replace(' ', '')
def crawl_daili66(self, page_count=4):
start_url = 'http://www.66ip.cn/{}.html'
urls = [start_url.format(page) for page in range(1, page_count+1)]
for url in urls:
print('Crawling', url)
html = get_page(url)
if html:
doc = pq(html)
trs = doc('.containerbox table tr:gt(0)').items()
for tr in trs:
ip = tr.find('td:nth-child(1)').text()
port = tr.find('td:nth-child(2)').text()
yield ':'.join([ip, port])
def crawl_data5u(self):
for i in ['gngn', 'gnpt']:
start_url = 'http://www.data5u.com/free/{}/index.shtml'.format(i)
html = get_page(start_url)
ip_address = re.compile(
'<ul class="l2">\s*<span><li>(.*?)</li></span>\s*<span style="width: 100px;">'
'<li class=".*">(.*?)</li></span>'
)
# \s* matches the whitespace (including the line break) between the tags
re_ip_address = ip_address.findall(str(html))
for address, port in re_ip_address:
result = address + ":" + port
yield result.replace(' ', '')
def crawl_kxdaili(self):
for i in range(1,4):
start_url = 'http://www.ip.kxdaili.com/dailiip/1/{}.html#ip'.format(i)
html = get_page(start_url)
ip_address = re.compile('<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
# \s* matches the whitespace (including the line break) between the tags
re_ip_address = ip_address.findall(str(html))
for address, port in re_ip_address:
result = address + ":" + port
yield result.replace(' ', '')
# proxy = FreeProxyGetter()
# proxy.get_raw_proxies()
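The ProxyMetaclass is the most interesting part of getter.py: every method whose name contains 'crawl_' is registered automatically, so adding a new proxy source only means adding another crawl_xxx() method. A stripped-down standalone demo of the same idea (my own example):

class CrawlerMeta(type):
    def __new__(cls, name, bases, attrs):
        # collect the names of all crawl_ methods defined on the class
        attrs['__CrawlFunc__'] = [k for k in attrs if 'crawl_' in k]
        attrs['__CrawlFuncCount__'] = len(attrs['__CrawlFunc__'])
        return type.__new__(cls, name, bases, attrs)

class Demo(object, metaclass=CrawlerMeta):
    def crawl_site_a(self):
        yield '1.1.1.1:80'
    def crawl_site_b(self):
        yield '2.2.2.2:8080'

print(Demo.__CrawlFunc__)       # ['crawl_site_a', 'crawl_site_b']
print(Demo.__CrawlFuncCount__)  # 2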
db.py
import redis
from proxypool.error import PoolEmptyError
from proxypool.setting import HOST, PORT, PASSWORD
class RedisClient(object):
def __init__(self, host=HOST, port=PORT):
if PASSWORD:
self._db = redis.Redis(host=host, port=port, password=PASSWORD)
else:
self._db = redis.Redis(host=host, port=port)
def get(self, count=1):
"""
Pop a batch of proxies from the left end; the left end holds the oldest proxies, the right end the freshest
"""
proxies = self._db.lrange("proxies", 0, count - 1)
# trim the list so that only the remaining range is kept
self._db.ltrim("proxies", count, -1)
return proxies
def put(self, proxy):
"""
Push a proxy onto the right end
"""
self._db.rpush("proxies", proxy)
def pop(self):
"""
Called by the API: return and remove the proxy at the right (newest) end of the queue
"""
try:
return self._db.rpop("proxies").decode('utf-8')
except:
raise PoolEmptyError
# property decorator
@property
def queue_len(self):
"""
get length from queue
"""
return self._db.llen("proxies")
def flush(self):
"""
Flush everything (note: flushall clears the whole redis database)
"""
self._db.flushall()
if __name__ == '__main__':
conn = RedisClient()
print(conn.pop())
setting.py
# Redis host and port
HOST = 'localhost'
PORT = 6379
# if Redis requires a password, set it here; otherwise leave it as None or ''
PASSWORD = ''
# timeout (seconds) used when testing a proxy
get_proxy_timeout = 9
# size limits of the proxy pool
POOL_LOWER_THRESHOLD = 20
POOL_UPPER_THRESHOLD = 100
# check cycles (seconds)
VALID_CHECK_CYCLE = 60
POOL_LEN_CHECK_CYCLE = 20
# URL used to test proxies against (ideally the site you actually want to crawl)
TEST_API = 'http://www.fang.com/SoufunFamily.htm'
error.py
class ResourceDepletionError(Exception):
def __init__(self):
Exception.__init__(self)
def __str__(self):
return repr('The proxy source is exhausted')
class PoolEmptyError(Exception):
def __init__(self):
Exception.__init__(self)
def __str__(self):
return repr('The proxy pool is empty')
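Once run.py is started, other crawlers can pull proxies from the Flask API defined in api.py. A minimal client sketch (the port is Flask's default 5000, since app.run() is called with no arguments):

import requests

def get_proxy():
    # /get pops one proxy from the right end of the redis list (see api.py)
    resp = requests.get('http://127.0.0.1:5000/get')
    if resp.status_code == 200:
        return resp.text
    return None

proxy = get_proxy()
if proxy:
    proxies = {'http': 'http://' + proxy, 'https': 'https://' + proxy}
    print(requests.get('http://httpbin.org/ip', proxies=proxies).text)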
Scraping WeChat articles via Sogou
run.py
from weixin.spider import Spider
if __name__ == '__main__':
spider = Spider()
spider.run()
spider.py
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import requests
from requests import ReadTimeout, Session
from weixin.config import PROXY_POOL_URL, VALID_STATUS, MAX_FAILED_TIME, KEYWORD
from weixin.db import RedisQueue
from weixin.mysql import MySQL
from weixin.request import WeixinRequest
class Spider():
base_url = 'https://weixin.sogou.com/weixin'
keyword = KEYWORD
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
'Connection': 'keep-alive',
'Cookie': 'CXID=68999D20535A955E54EEB369EEBDAA87; SUID=7D0481DF3565860A5B922DAB00041476; '
'SUV=00724ADFDF81047D5B9390FE3CE03520; ad=Ukllllllll2b6ALrlllllVmUX@1lllllTc99Kyllll'
'9llllljylll5@@@@@@@@@@; IPLOC=CN5101; ABTEST=0|1536564030|v1; weixinIndexVisited=1; '
'SNUID=6AEE6B35EBEE9D9F5957A098EBEC0DF0; sct=1; JSESSIONID=aaaqsTn37HldSeg_akWyw; '
'ppinf=5|1538793682|1540003282|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo0NTol'
'RTYlODUlQTIlRTYlODUlQTIlRTYlODUlQTIlRTYlOTclQjYlRTUlODUlODl8Y3J0OjEwOjE1Mzg3OTM2ODJ8cmVm'
'bmljazo0NTolRTYlODUlQTIlRTYlODUlQTIlRTYlODUlQTIlRTYlOTclQjYlRTUlODUlODl8dXNlcmlkOjQ0Om85d'
'DJsdURabHBHRjJ1TF9vbGtrV01MbTlHWFFAd2VpeGluLnNvaHUuY29tfA; pprdig=YXVgbs0p9dU4aBgDw7V_id'
'ljKjCcGiXgeUpafLd_FO65GO0AMS3VWq_ogoKBR7XpAChV9r3DxwwMN_lwgpTwjbT4al7JXyKKOua-q3IoMvfo2KwI1'
'sXoNQKlyuxomXov9kuvMJkAHq4x6HCYOtsNhkW92H_acgTIeDo65hnDIbc; sgid=15-37413245-AVu4ININKITuO'
'1IBrovHceA; ppmdig=153880606700000019649cd69fcbff1cb91d0c6884906b6b; LSTMV=469%2C259; LCLKINT=5007',
'Host': 'weixin.sogou.com',
'Upgrade-Insecure-Requests': '1',
# to get around hot-link protection the server checks whether the Referer header points back to itself, so we include a Referer
'Referer': 'https://weixin.sogou.com/weixin?query=%E9%A3%8E%E6%99%AF&type=2&page=17&ie=utf8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}
# initialize the Session, RedisQueue and MySQL objects, which handle sending requests, queueing them, and storing results
session = Session()
queue = RedisQueue()
mysql = MySQL()
def get_proxy(self):
"""
Fetch a proxy from the proxy pool
"""
try:
response = requests.get(PROXY_POOL_URL)
if response.status_code == 200:
print('Get Proxy', response.text)
return response.text
return None
except requests.ConnectionError:
return None
def start(self):
"""
Initial setup
"""
# update the session headers globally so every request carries the cookies
self.session.headers.update(self.headers)
# build the start URL
start_url = self.base_url + '?' + urlencode({'query': self.keyword, 'type': 2})
# build a WeixinRequest; the callback parse_index() parses the response once it arrives, and need_proxy=True means the request must go through a proxy
weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)
# add the request to the queue to schedule the first crawl
self.queue.add(weixin_request)
def parse_index(self, response):
"""
Parse the index page
:param response: the response
:return: new requests
"""
doc = pq(response.text)
# collect all the WeChat article links on this page
items = doc('.news-box .news-list li .txt-box h3 a').items()
for item in items:
url = item.attr('href')
# wrap each link in a WeixinRequest and yield it
weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
yield weixin_request
# link to the next index page
next = doc('#sogou_next').attr('href')
if next:
url = self.base_url + str(next)
# wrap it in a WeixinRequest and yield it
weixin_request = WeixinRequest(url=url, callback=self.parse_index, need_proxy=True)
yield weixin_request
def parse_detail(self, response):
"""
Parse the article detail page
:param response: the response
:return: the WeChat article as a dict
"""
doc = pq(response.text)
# extract the title, body text, publish date, author nickname and account name, and return them as a dict
data = {
'title': doc('.rich_media_title').text(),
'content': doc('.rich_media_content').text(),
'date': doc('#publish_time').text(),
'nickname': doc('#meta_content > span.rich_media_meta.rich_media_meta_text').text(),
'wechat': doc('#profileBt > #js_name').text()
}
yield data
# the scheduler checks the type of whatever is yielded; dicts are written to MySQL via insert()
def request(self, weixin_request):
"""
Send the request
:param weixin_request: the request
:return: the response
"""
try:
# check whether the request needs a proxy first, then send it with Session.send()
if weixin_request.need_proxy:
proxy = self.get_proxy()
if proxy:
proxies = {
'http': 'http://' + proxy,
'https': 'https://' + proxy
}
return self.session.send(weixin_request.prepare(), timeout=weixin_request.timeout,
allow_redirects=False, proxies=proxies)
# prepare() turns the request into a PreparedRequest; send it without redirects and with the request's own timeout
return self.session.send(weixin_request.prepare(), timeout=weixin_request.timeout, allow_redirects=False)
except (ConnectionError, ReadTimeout) as e:
print(e.args)
return False
def error(self, weixin_request):
"""
Error handling: count the failure and re-queue the request if it has not failed too often
"""
weixin_request.fail_time = weixin_request.fail_time + 1
print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
if weixin_request.fail_time < MAX_FAILED_TIME:
self.queue.add(weixin_request)
def schedule(self):
"""
Schedule requests: keep looping while the queue is not empty
"""
while not self.queue.empty():
# pop the next request and send it with request()
# after the first pass the queue holds the detail-page requests from page 1 plus the next-page request;
# later passes pop those detail-page requests, fetch them and hand them to their callback parse_detail()
weixin_request = self.queue.pop()
callback = weixin_request.callback
print('Schedule', weixin_request.url)
response = self.request(weixin_request)
# if request() returned a response with a valid status code, call the request's callback (e.g. parse_index()) to parse it
if response and response.status_code in VALID_STATUS:
results = list(callback(response))
# iterate over the results and dispatch them by type with isinstance()
if results:
for result in results:
print('New Result', result)
# new WeixinRequests go back onto the queue; dicts are stored in MySQL
if isinstance(result, WeixinRequest):
self.queue.add(result)
if isinstance(result, dict):
self.mysql.insert('articles', result)
else:
self.error(weixin_request)
else:
self.error(weixin_request)
def run(self):
"""
Entry point
"""
self.start()
self.schedule()
if __name__ == '__main__':
spider = Spider()
spider.run()
request.py
from requests import Request
from weixin.config import TIMEOUT
class WeixinRequest(Request):
def __init__(self, url, callback, method='GET', headers=None, need_proxy=False, fail_time=0, timeout=TIMEOUT):
Request.__init__(self, method, url, headers)
self.callback = callback
self.need_proxy = need_proxy
self.fail_time = fail_time
self.timeout = timeout
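WeixinRequest only adds bookkeeping fields (callback, need_proxy, fail_time, timeout) on top of requests.Request, so it can still be turned into a PreparedRequest and sent through a Session, which is exactly what Spider.request() does. A minimal standalone sketch (httpbin.org is just a placeholder URL):

from requests import Session
from weixin.request import WeixinRequest

session = Session()
req = WeixinRequest(url='http://httpbin.org/get', callback=print, timeout=10)
# Request.prepare() builds the PreparedRequest that Session.send() actually executes
resp = session.send(req.prepare(), timeout=req.timeout, allow_redirects=False)
print(resp.status_code)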
db.py
from pickle import dumps, loads
from redis import StrictRedis
from weixin.config import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_KEY
from weixin.request import WeixinRequest
class RedisQueue():
def __init__(self):
"""
Initialize the Redis connection
"""
self.db = StrictRedis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)
def add(self, request):
"""
Serialize the Request and push it onto the queue
:param request: the request object
:return: result of the push
"""
if isinstance(request, WeixinRequest):
# serialize the request with pickle's dumps(), then rpush it onto the queue
return self.db.rpush(REDIS_KEY, dumps(request))
return False
def pop(self):
"""
Pop the next Request and deserialize it
:return: Request or False
"""
if self.db.llen(REDIS_KEY):
# lpop the raw bytes from the queue, then turn them back into a WeixinRequest with pickle's loads()
return loads(self.db.lpop(REDIS_KEY))
else:
return False
def clear(self):
self.db.delete(REDIS_KEY)
def empty(self):
return self.db.llen(REDIS_KEY) == 0
if __name__ == '__main__':
db = RedisQueue()
start_url = 'http://www.baidu.com'
weixin_request = WeixinRequest(url=start_url, callback='hello', need_proxy=True)
db.add(weixin_request)
request = db.pop()
print(request)
print(request.callback, request.need_proxy)
mysql.py
import pymysql
from weixin.config import *
class MySQL():
def __init__(self, host=MYSQL_HOST, username=MYSQL_USER, password=MYSQL_PASSWORD, port=MYSQL_PORT,
database=MYSQL_DATABASE):
"""
Initialize the MySQL connection
:param host: MySQL host
:param username: user name
:param password: password
:param port: port
:param database: database name
"""
try:
# connect() creates the MySQL connection object
self.db = pymysql.connect(host, username, password, database, charset='utf8', port=port)
# on success, cursor() returns a cursor that is used to execute SQL statements
self.cursor = self.db.cursor()
except pymysql.MySQLError as e:
print(e.args)
def insert(self, table, data):
"""
Insert a row
:param table: table name
:param data: dict mapping column names to values
:return:
"""
keys = ', '.join(data.keys())
values = ', '.join(['%s'] * len(data))
# build the SQL statement; the values are %s placeholders that execute() fills from a tuple
sql_query = 'insert into %s (%s) values (%s)'%(table, keys, values)
try:
self.cursor.execute(sql_query, tuple(data.values()))
# commit() is what actually pushes the statement to the database
self.db.commit()
except pymysql.MySQLError as e:
print(e.args)
# on failure, rollback() undoes the statement so the transaction keeps the data consistent
self.db.rollback()
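For a concrete sense of what insert() builds: with data = {'title': ..., 'content': ...} the statement becomes insert into articles (title, content) values (%s, %s), and execute() fills the placeholders from tuple(data.values()). The string-building part can be tried on its own (no database needed):

data = {'title': 'demo title', 'content': 'demo content'}
table = 'articles'
keys = ', '.join(data.keys())
values = ', '.join(['%s'] * len(data))
print('insert into %s (%s) values (%s)' % (table, keys, values))
# insert into articles (title, content) values (%s, %s)
print(tuple(data.values()))
# ('demo title', 'demo content')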
config.py
PROXY_POOL_URL = 'http://127.0.0.1:5555/random'
VALID_STATUS = [200]
TIMEOUT = 10
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_PASSWORD = ''
REDIS_KEY = 'weixin'
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''
MYSQL_DATABASE = 'weixin'
MAX_FAILED_TIME = 20
KEYWORD = '风景'