码上行动1.1

Read good code the way you read the literature.


I have been learning web scraping for a while, but progress has been slow. A big reason, I think, is that I only watched videos and imitated the code, and forgot it soon afterwards. As the saying goes, read a book a hundred times and its meaning reveals itself. With this exercise I want to go back over the code I have studied and organize it, so that my understanding moves up a level.

Scraping Baidu Tieba

# coding=utf-8
import requests


class TiebaSpider:
    def __init__(self, tieba_name):
        self.tieba_name = tieba_name
        self.url_temp = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}

    def get_url_list(self):  # 1. build the list of page URLs
        # url_list = []
        # for i in range(1000):
        #     url_list.append(self.url_temp.format(i*50))
        # return url_list
        return [self.url_temp.format(i * 50) for i in range(1000)]

    def parse_url(self, url):  # send the request and get the response
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def save_html(self, html_str, page_num):  # save the HTML string to a file
        file_path = "{}—第{}页.html".format(self.tieba_name, page_num)
        with open(file_path, "w", encoding="utf-8") as f:  # e.g. "李毅—第4页.html"
            f.write(html_str)

    def run(self):  # main logic
        # 1. build the URL list
        url_list = self.get_url_list()
        # 2. iterate over it, sending requests and getting responses
        for page_num, url in enumerate(url_list, start=1):
            html_str = self.parse_url(url)
            # 3. save each page, using the enumerate counter as the page number
            self.save_html(html_str, page_num)


if __name__ == '__main__':
    tieba_spider = TiebaSpider("lol")
    tieba_spider.run()
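
The pn parameter steps by 50 per page, so page n corresponds to pn = (n - 1) * 50. A quick sketch (no requests sent) to confirm the URLs that get_url_list() produces:

# minimal check of the pagination pattern, using the TiebaSpider class above
spider = TiebaSpider("lol")
for url in spider.get_url_list()[:3]:
    print(url)
# https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=0
# https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=50
# https://tieba.baidu.com/f?kw=lol&ie=utf-8&pn=100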

Scraping Baidu Translate

import requests
import json
import sys

class BaiduFanyi:
    def __init__(self,trans_str):
        self.trans_str = trans_str
        self.lang_detect_url = "http://fanyi.baidu.com/langdetect"
        self.trans_url = "http://fanyi.baidu.com/basetrans"
        self.headers = {"User-Agent":"Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36"}

    def parse_url(self, url, data):  # send a POST request and get the response
        response = requests.post(url,data=data,headers=self.headers)
        return json.loads(response.content.decode())

    def get_ret(self, dict_response):  # extract the translation result
        ret = dict_response["trans"][0]["dst"]
        print("result is :", ret)


    def run(self):  # main logic
        # 1. detect the language of the input
        #    1.1 prepare the langdetect URL and POST data
        lang_detect_data = {"query": self.trans_str}
        #    1.2 send the POST request and get the response
        #    1.3 extract the detected language from it
        lang = self.parse_url(self.lang_detect_url, lang_detect_data)["lan"]
        # 2. prepare the POST data for the translation request
        trans_data = {"query": self.trans_str, "from": "zh", "to": "en"} if lang == "zh" else \
            {"query": self.trans_str, "from": "en", "to": "zh"}
        # 3. send the request and get the response
        dict_response = self.parse_url(self.trans_url, trans_data)
        # 4. extract the translation result
        self.get_ret(dict_response)


if __name__ == '__main__':
    trans_str= sys.argv[1]
    baidu_fanyi = BaiduFanyi(trans_str)
    baidu_fanyi.run()
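
The script is run from the command line, e.g. python baidu_fanyi.py 你好 (the file name is my assumption; the post does not name it). Since get_ret() indexes dict_response["trans"][0]["dst"] directly, it crashes if the mobile interface changes; a defensive variant, purely as a sketch:

def extract_result(dict_response):
    # hypothetical helper, not in the original: same lookup as get_ret(), but
    # returns None instead of raising when the "trans" key is missing
    try:
        return dict_response["trans"][0]["dst"]
    except (KeyError, IndexError):
        print("unexpected response:", dict_response)
        return None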

Maoyan movie rankings

import json
import re
import time

import requests
from requests.exceptions import RequestException


def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
        }
        response = requests.get(url,headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None



def parse_one_page(html):
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                         + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                         + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],
            'time': item[4].strip()[5:],
            'score': item[5] + item[6]
        }

# json.dumps() serializes the dict; ensure_ascii=False keeps the output as readable Chinese instead of \uXXXX escapes
def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html =  get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    for i in range(10):
        main(offset=i*10)
        time.sleep(1)
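
A quick illustration of the ensure_ascii=False point made in the comment above (a standalone sketch, not real movie data):

import json

item = {'index': '1', 'title': '测试电影'}
print(json.dumps(item))                      # non-ASCII characters become \uXXXX escapes
print(json.dumps(item, ensure_ascii=False))  # Chinese characters stay readable in result.txt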

Optimized version (lxml/XPath)

import json
from lxml import etree
import time

import requests
from requests.exceptions import RequestException


def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    resp = etree.HTML(html)
    dds = resp.xpath('//dl[@class="board-wrapper"]/dd')
    items = []
    for dd in dds:
        index = dd.xpath('./i/text()')[0]
        image = dd.xpath('.//img[@class="board-img"]/@data-src | .//img[@class="board-img"]/@src')[0]
        title = dd.xpath('.//p[@class="name"]//text()')[0]
        actor = dd.xpath('.//p[@class="star"]//text()')[0].strip()[3:]
        release_time = dd.xpath('.//p[@class="releasetime"]/text()')[0][5:]
        score = ''.join(dd.xpath('.//p[@class="score"]//text()'))
        item = [index, image, title, actor, release_time, score]
        items.append(item)
    return items

    # pattern = re.compile('<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    #                      + '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    #                      + '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    # items = re.findall(pattern, html)
    # for item in items:
    #     yield {
    #         'index': item[0],
    #         'image': item[1],
    #         'title': item[2],
    #         'actor': item[3].strip()[3:],
    #         'time': item[4].strip()[5:],
    #         'score': item[5] + item[6]
    #     }


# json.dumps() serializes the content; ensure_ascii=False keeps the output as readable Chinese instead of \uXXXX escapes
def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main():
    url_list = ['http://maoyan.com/board/4?offset={}'.format(i * 10) for i in range(10)]
    for url in url_list:
        html = get_one_page(url)
        items = parse_one_page(html)
        print(items)
        write_to_file(items)
        # break


if __name__ == '__main__':
    main()
    # time.sleep(1)
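
Unlike the regex version, parse_one_page() now returns plain lists, so result.txt loses its field names. If that matters, a hedged variant that keeps the dict shape of the original (same XPath expressions as above):

from lxml import etree

def parse_one_page_dicts(html):
    # sketch only: yields dicts so the JSON lines keep their field names
    resp = etree.HTML(html)
    for dd in resp.xpath('//dl[@class="board-wrapper"]/dd'):
        yield {
            'index': dd.xpath('./i/text()')[0],
            'image': dd.xpath('.//img[@class="board-img"]/@data-src | .//img[@class="board-img"]/@src')[0],
            'title': dd.xpath('.//p[@class="name"]//text()')[0],
            'actor': dd.xpath('.//p[@class="star"]//text()')[0].strip()[3:],
            'time': dd.xpath('.//p[@class="releasetime"]/text()')[0][5:],
            'score': ''.join(dd.xpath('.//p[@class="score"]//text()')),
        }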

Toutiao street-snap images

import requests
from urllib.parse import urlencode
from requests import codes
import os
from hashlib import md5
from multiprocessing.pool import Pool


def get_page(offset):
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab'
    }
    base_url = 'https://www.toutiao.com/search_content/?'
    url = base_url + urlencode(params)
    try:
        resp = requests.get(url)
        if codes.ok == resp.status_code:
            return resp.json()
    except requests.ConnectionError:
        return None


def get_images(json):
    if json and json.get('data'):
        data = json.get('data')
        for item in data:
            if item.get('cell_type') is not None:
                continue
            title = item.get('title')
            images = item.get('image_list')
            for image in images:
                yield {
                    'image': 'https:' + image.get('url'),
                    'title': title
                }


def save_image(item):
    img_path = 'img' + os.path.sep + item.get('title')
    # makedirs creates the whole directory tree recursively
    if not os.path.exists(img_path):
        os.makedirs(img_path)
    try:
        url = item.get('image').replace('list', 'large')
        resp = requests.get(url)
        if codes.ok == resp.status_code:
            # the md5 of the content acts as a fingerprint, so identical images share a name; file_suffix is the extension
            file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format(
                file_name=md5(resp.content).hexdigest(),
                file_suffix='jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(resp.content)
                print('Downloaded image path is %s' % file_path)
            else:
                print('Already Downloaded', file_path)
    except requests.ConnectionError:
        print('Failed to Save Image,item %s' % item)


def main(offset):
    json = get_page(offset)
    for item in get_images(json):
        print(item)
        save_image(item)


GROUP_START = 0
GROUP_END = 7

if __name__ == '__main__':
    pool = Pool()
    groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
    pool.map(main, groups)
    pool.close()
    pool.join()
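
The md5 file name is what makes downloads idempotent: identical image bytes always hash to the same 32-character digest, so the os.path.exists check skips files that were already saved. In isolation:

from hashlib import md5

content = b'the raw bytes of a downloaded image'
file_name = '{file_name}.{file_suffix}'.format(file_name=md5(content).hexdigest(), file_suffix='jpg')
print(file_name)  # same bytes -> same name, so duplicates are written only once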

Saving Zhihu Explore posts to a txt file

import requests
from pyquery import PyQuery as pq

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
}
html = requests.get(url, headers=headers).text
doc = pq(html)
items = doc('.explore-tab .feed-item').items()
for item in items:
    question = item.find('h2').text()
    print(question)
    author = item.find('.author-link-line').text()
    print(author)
    answer = pq(item.find('.content').html()).text()
    print(answer)
    # write inside the loop, otherwise only the last item ends up in the file
    with open('explore.txt', 'a', encoding='utf-8') as f:
        f.write('\n'.join([question, author, answer]))
        f.write('\n' + '=' * 50 + '\n')
#    file = open('explore.txt', 'a', encoding='utf-8')
#    file.write('\n'.join([question, author, answer]))
#    file.write('\n' + '='*50 + '\n')
#    file.close()

Crawling 崔大神's Weibo

from urllib.parse import urlencode
import requests
from pymongo import MongoClient
from pyquery import PyQuery as pq


base_url = 'https://m.weibo.cn/api/container/getIndex?'

headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2830678474',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/68.0.3440.106 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
client = MongoClient()
db = client.weibo
collection = db.weibo
max_page = 10


def get_page(page):
    params={
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json(), page
    except requests.ConnectionError as e:
        print('Error', e.args)


def parse_page(json, page: int):
    if json:
        items = json.get('data').get('cards')
        for index, item in enumerate(items):
            # skip the second card on the first page, which is not a regular weibo
            if page == 1 and index == 1:
                continue
            else:
                item = item.get('mblog')
                weibo = {}
                weibo['id'] = item.get('id')
                # the "text" field is HTML, so parse it with pyquery to get plain text
                weibo['text'] = pq(item.get('text')).text()
                weibo['attitudes'] = item.get('attitudes_count')
                weibo['comments'] = item.get('comments_count')
                weibo['reposts'] = item.get('reposts_count')
                yield weibo

def save_to_mongo(result):
    if collection.insert_one(result):
        print('Save to Mongo')


if __name__ == "__main__":
    for page in range(1, max_page+1):
        json = get_page(page)
        results = parse_page(*json)
        for result in results:
            print(result)
            save_to_mongo(result)
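
The pq(...).text() call in parse_page() is just HTML-to-text conversion: the mblog "text" field arrives as an HTML fragment with links and icons, and pyquery strips the tags. A minimal sketch with a made-up fragment:

from pyquery import PyQuery as pq

raw = '<span class="surl-text">转发微博</span> <a href="https://m.weibo.cn/p/example">网页链接</a>'
print(pq(raw).text())  # tags are stripped, only the readable text remains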

Scraping Taobao food listings with Selenium

import re
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from config import *
import pymongo


client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


# to use headless Chrome as the browser instead:
# chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument('--headless')
# browser = webdriver.Chrome(chrome_options=chrome_options)

# use PhantomJS as the browser
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
wait = WebDriverWait(browser, 10)
# the browser window size usually needs to be set explicitly
browser.set_window_size(1400, 900)


def search():
    print('正在搜索...')
    try:
        browser.get('https://www.taobao.com')
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))
        input.send_keys(KEYWORD)
        submit.click()
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
        get_products()
        return total.text
    except TimeoutException:
        return search()

def next_page(page_number):
    print('正在翻页', page_number)
    try:
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
        input.clear()
        input.send_keys(page_number)
        submit.click()
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)
        ))
        get_products()
    except TimeoutException:
        next_page(page_number)

def get_products():
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('data-src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        save_to_mongo(product)

def save_to_mongo(result):
    try:
        if db[MONGO_TABLE].insert_one(result):
            print('存储到MONGODB成功', result)
    except Exception:
        print('存储到MONGODB失败', result)

def main():
    """try/finally makes sure the browser is closed at the end; the broad except keeps one bad page from aborting the whole run"""
    try:
        total = search()
        total = int(re.compile(r'(\d+)').search(total).group(1))
        for i in range(2, total + 1):
            next_page(i)
    except Exception:
        print('出错啦')
    finally:
        browser.close()


if __name__ == "__main__":
    main()
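
PhantomJS is no longer maintained, so the commented-out headless Chrome setup near the top of the script is the more durable option. A sketch of the swap, keeping the browser/wait names the rest of the code uses (newer Selenium releases take options= instead of chrome_options=):

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(options=chrome_options)  # replaces the PhantomJS line above
wait = WebDriverWait(browser, 10)
browser.set_window_size(1400, 900)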

Building a proxy pool

run.py

from proxypool.api import app
from proxypool.schedule import Schedule

def main():
    s = Schedule()
    s.run()
    app.run()

if __name__ == '__main__':
    main()

api.py

from flask import Flask, g
from .db import RedisClient

__all__ = ['app']

app = Flask(__name__)

def get_conn():
    """
    Open a new redis connection if there is none yet for the current application context
    """
    if not hasattr(g, 'redis_client'):
        g.redis_client = RedisClient()
    return g.redis_client

@app.route('/')
def index():
    return '<h2>Welcome to Proxy Pool System</h2>'

@app.route('/get')
def get_proxy():
    """
    Get a proxy
    """
    conn = get_conn()
    return conn.pop()

@app.route('/count')
def get_counts():
    """
    Get the count of proxies
    """
    conn = get_conn()
    return str(conn.queue_len)

if __name__ == '__main__':
    app.run()
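
With the scheduler and the Flask app running, the two endpoints can be consumed from any client. A sketch using requests, assuming Flask's default port 5000 (app.run() above does not override it):

import requests

BASE = 'http://127.0.0.1:5000'  # assumption: Flask default host and port
print(requests.get(BASE + '/count').text)  # number of proxies currently in the pool
print(requests.get(BASE + '/get').text)    # pops one proxy from the right end of the queue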

schedule.py

import aiohttp
import asyncio
from proxypool.db import RedisClient
from proxypool.error import ResourceDepletionError
from proxypool.getter import FreeProxyGetter
from proxypool.setting import *
import time
from multiprocessing import Process
try:
    from aiohttp.errors import ProxyConnectionError, ServerDisconnectedError, ClientResponseError
except ImportError:
    from aiohttp import ClientProxyConnectionError as ProxyConnectionError, ServerDisconnectedError, ClientResponseError, ClientConnectionError


class ValidityTester(object):
    test_api = TEST_API

    def __init__(self):
        self._raw_proxies = None
        self._usable_proxies = []

    def set_raw_proxies(self, proxies):
        self._raw_proxies = proxies
        self._conn = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        text one proxy, if valid, put them to usable_proxies
        """
        try:
            async with aiohttp.ClientSession() as session:
                try:
                    if isinstance(proxy, bytes):
                        # proxies read from redis are bytes and need decoding
                        proxy = proxy.decode('utf-8')
                    real_proxy = 'http://' + proxy
                    print('Testing', proxy)
                    async with session.get(self.test_api, proxy=real_proxy, timeout=get_proxy_timeout) as response:
                        if response.status == 200:
                            self._conn.put(proxy)
                            print('Valid proxy', proxy)
                except (ProxyConnectionError, TimeoutError, ValueError):
                    print('Invalid proxy', proxy)
        except (aiohttp.ServerDisconnectedError, aiohttp.ClientResponseError, aiohttp.ClientConnectorError) as s:
            print(s)
            pass

    def test(self):
        """
        aio test all proxies
        """
        print('ValidityTester is working')
        try:
            loop = asyncio.get_event_loop()
            tasks = [self.test_single_proxy(proxy) for proxy in self._raw_proxies]
            loop.run_until_complete(asyncio.wait(tasks))
        except ValueError:
            print('Async Error')

class PoolAdder(object):
    """
    add proxy to pool
    """

    def __init__(self, threshold):
        self._threshold = threshold
        self._conn = RedisClient()
        self._tester = ValidityTester()
        self._crawler = FreeProxyGetter()

    def is_over_threshold(self):
        """
        judge if count is overflow
        """
        if self._conn.queue_len >= self._threshold:
            return True
        else:
            return False

    def add_to_queue(self):
        print('PoolAdder is working')
        proxy_count = 0
        while not self.is_over_threshold():
            for callback_label in range(self._crawler.__CrawlFuncCount__):
                callback = self._crawler.__CrawlFunc__[callback_label]
                raw_proxies = self._crawler.get_raw_proxies(callback)
                # test the crawled proxies
                self._tester.set_raw_proxies(raw_proxies)
                self._tester.test()
                proxy_count += len(raw_proxies)
                if self.is_over_threshold():
                    print('IP is enough, waiting to be used')
                    break
            if proxy_count == 0:
                raise ResourceDepletionError

class Schedule(object):
    @staticmethod
    def valid_proxy(cycle=VALID_CHECK_CYCLE):
        """
        take half of the proxies in redis and re-test them
        """
        conn = RedisClient()
        tester = ValidityTester()
        while True:
            print('Refreshing ip')
            count = int(0.5 * conn.queue_len)
            if count == 0:
                print('Waiting for adding')
                time.sleep(cycle)
                continue
            raw_proxies = conn.get(count)
            tester.set_raw_proxies(raw_proxies)
            tester.test()
            time.sleep(cycle)

    @staticmethod
    def check_pool(lower_threshold=POOL_LOWER_THRESHOLD,
                   upper_threshold=POOL_UPPER_THRESHOLD,
                   cycle=POOL_LEN_CHECK_CYCLE):
        """
        If the number of proxies less than lower_threshold, add proxy
}        """
        conn = RedisClient()
        adder = PoolAdder(upper_threshold)
        while True:
            if conn.queue_len < lower_threshold:
                adder.add_to_queue()
            time.sleep(cycle)

    def run(self):
        print('IP processing running')
        valid_process = Process(target=Schedule.valid_proxy)
        check_process = Process(target=Schedule.check_pool)
        valid_process.start()
        check_process.start()

getter.py

from .utils import get_page
from pyquery import PyQuery as pq
import re

class ProxyMetaclass(type):
    """
    元类,在FreeProxyGetter类中加入__CrawlFunc__和__CrawlFuncCount__两个参数,
    分别表示爬虫函数和爬虫函数的数量
    """
    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for k,v in attrs.items():
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)

class FreeProxyGetter(object, metaclass=ProxyMetaclass):
    def get_raw_proxies(self, callback):
        proxies = []
        print('Callback', callback)
        for proxy in eval("self.{}()".format(callback)):
            print('Getting', proxy, 'from', callback)
            proxies.append(proxy)
        return proxies

    def crawl_kuaidaili(self):
        for page in range(1, 4):
            # domestic high-anonymity proxies; fill the page number into the URL
            start_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(page)
            html = get_page(start_url)
            ip_address = re.compile(
                r'<td data-title="IP">(.*)</td>\s*<td data-title="PORT">(\w+)</td>'
            )
            re_ip_address = ip_address.findall(str(html))
            for address, port in re_ip_address:
                result = address + ":" + port
                yield result.replace(' ', '')

    def crawl_xicidaili(self):
        for page in range(1,4):
            start_url = 'http://www.xicidaili.com/nn/{}'.format(page)
            html = get_page(start_url)
            ip_address = re.compile(
                r'<td class="country"><img src="http://fs.xicidaili.com/images/flag/cn.png" '
                r'alt="Cn"></td>\s*<td>(.*?)</td>\s*<td>(.*?)</td>'
            )
            # \s* matches the whitespace (including newlines) between table cells
            re_ip_address = ip_address.findall(str(html))
            for address, port in re_ip_address:
                result = address + ':' + port
                yield result.replace(' ', '')

    def crawl_daili66(self, page_count=4):
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count+1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])

    def crawl_data5u(self):
        for i in ['gngn', 'gnpt']:
            start_url = 'http://www.data5u.com/free/{}/index.shtml'.format(i)
            html = get_page(start_url)
            ip_address = re.compile(
                r'<ul class="l2">\s*<span><li>(.*?)</li></span>\s*<span style="width: 100px;">'
                r'<li class=".*">(.*?)</li></span>'
            )
            # \s* matches the whitespace (including newlines) between elements
            re_ip_address = ip_address.findall(str(html))
            for address, port in re_ip_address:
                result = address + ":" + port
                yield result.replace(' ', '')

    def crawl_kxdaili(self):
        for i in range(1,4):
            start_url = 'http://www.ip.kxdaili.com/dailiip/1/{}.html#ip'.format(i)
            html = get_page(start_url)
            ip_address = re.compile(r'<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
            # \s* matches the whitespace (including newlines) between table cells
            re_ip_address = ip_address.findall(str(html))
            for address, port in re_ip_address:
                result = address + ":" + port
                yield result.replace(' ', '')

# proxy = FreeProxyGetter()
# proxy.get_raw_proxies()
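
The metaclass is the interesting part: at class-creation time it scans the attribute dict for names containing 'crawl_' and records them, which is how PoolAdder can iterate over all crawler methods without hard-coding them. A self-contained toy version of the same idea (the demo class names are mine):

class DemoMeta(type):
    def __new__(cls, name, bases, attrs):
        # collect every attribute whose name contains 'crawl_'
        attrs['__CrawlFunc__'] = [k for k in attrs if 'crawl_' in k]
        attrs['__CrawlFuncCount__'] = len(attrs['__CrawlFunc__'])
        return type.__new__(cls, name, bases, attrs)

class DemoGetter(metaclass=DemoMeta):
    def crawl_site_a(self):
        yield '1.1.1.1:80'

    def crawl_site_b(self):
        yield '2.2.2.2:8080'

print(DemoGetter.__CrawlFunc__)       # ['crawl_site_a', 'crawl_site_b']
print(DemoGetter.__CrawlFuncCount__)  # 2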

db.py

import redis
from proxypool.error import PoolEmptyError
from proxypool.setting import HOST, PORT, PASSWORD

class RedisClient(object):
    def __init__(self, host=HOST, port=PORT):
        if PASSWORD:
            self._db = redis.Redis(host=host, port=port, password=PASSWORD)
        else:
            self._db = redis.Redis(host=host, port=port)

    def get(self, count=1):
        """
        从左侧批量拿出代理,左侧为老化的代理,右侧为更新的
        """
        proxies = self._db.lrange("proxies", 0, count - 1)
        # 对列表进行修剪trim,只保留区间内的值
        self._db.ltrim("proxies", count, -1)
        return proxies

    def put(self, proxy):
        """
        向右侧添加元素
        """
        self._db.rpush("proxies", proxy)

    def pop(self):
        """
        for the API: get a proxy from the right end
        (returns and removes the last element of the queue)
        """
        try:
            return self._db.rpop("proxies").decode('utf-8')
        except AttributeError:
            # rpop returns None when the queue is empty
            raise PoolEmptyError

    # property: expose the queue length as an attribute
    @property
    def queue_len(self):
        """
        get length from queue
        """
        return self._db.llen("proxies")

    def flush(self):
        """
         刷新整个队列
        """
        self._db.flushall()


if __name__ == '__main__':
    conn = RedisClient()
    print(conn.pop())

setting.py

# Redis host and port
HOST = 'localhost'
PORT = 6379

# Redis password; set to None or '' if there is none
PASSWORD = ''

# timeout (seconds) when testing a proxy
get_proxy_timeout = 9

# size limits for the proxy pool
POOL_LOWER_THRESHOLD = 20
POOL_UPPER_THRESHOLD = 100

# check cycles (seconds)
VALID_CHECK_CYCLE = 60
POOL_LEN_CHECK_CYCLE = 20

# URL used to test whether a proxy works
TEST_API = 'http://www.fang.com/SoufunFamily.htm'

error.py

class ResourceDepletionError(Exception):

    def __init__(self):
        Exception.__init__(self)

    def __str__(self):
        return repr('The proxy source is exhausted')

class PoolEmptyError(Exception):

    def __init__(self):
        Exception.__init__(self)

    def __str__(self):
        return repr('The proxy pool is empty')

Crawling WeChat articles via Sogou

run.py

from weixin.spider import Spider

if __name__ == '__main__':
    spider = Spider()
    spider.run()

spider.py

from urllib.parse import urlencode
from pyquery import PyQuery as pq
import requests
from requests import Session
from requests.exceptions import ConnectionError, ReadTimeout

from weixin.config import PROXY_POOL_URL, VALID_STATUS, MAX_FAILED_TIME, KEYWORD
from weixin.db import RedisQueue
from weixin.mysql import MySQL
from weixin.request import WeixinRequest


class Spider():
    base_url = 'https://weixin.sogou.com/weixin'
    keyword = KEYWORD
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'Cookie': 'CXID=68999D20535A955E54EEB369EEBDAA87; SUID=7D0481DF3565860A5B922DAB00041476; '
                  'SUV=00724ADFDF81047D5B9390FE3CE03520; ad=Ukllllllll2b6ALrlllllVmUX@1lllllTc99Kyllll'
                  '9llllljylll5@@@@@@@@@@; IPLOC=CN5101; ABTEST=0|1536564030|v1; weixinIndexVisited=1; '
                  'SNUID=6AEE6B35EBEE9D9F5957A098EBEC0DF0; sct=1; JSESSIONID=aaaqsTn37HldSeg_akWyw; '
                  'ppinf=5|1538793682|1540003282|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo0NTol'
                  'RTYlODUlQTIlRTYlODUlQTIlRTYlODUlQTIlRTYlOTclQjYlRTUlODUlODl8Y3J0OjEwOjE1Mzg3OTM2ODJ8cmVm'
                  'bmljazo0NTolRTYlODUlQTIlRTYlODUlQTIlRTYlODUlQTIlRTYlOTclQjYlRTUlODUlODl8dXNlcmlkOjQ0Om85d'
                  'DJsdURabHBHRjJ1TF9vbGtrV01MbTlHWFFAd2VpeGluLnNvaHUuY29tfA; pprdig=YXVgbs0p9dU4aBgDw7V_id'
                  'ljKjCcGiXgeUpafLd_FO65GO0AMS3VWq_ogoKBR7XpAChV9r3DxwwMN_lwgpTwjbT4al7JXyKKOua-q3IoMvfo2KwI1'
                  'sXoNQKlyuxomXov9kuvMJkAHq4x6HCYOtsNhkW92H_acgTIeDo65hnDIbc; sgid=15-37413245-AVu4ININKITuO'
                  '1IBrovHceA; ppmdig=153880606700000019649cd69fcbff1cb91d0c6884906b6b; LSTMV=469%2C259; LCLKINT=5007',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        # the Referer header is included because the server checks it for anti-hotlinking
        'Referer': 'https://weixin.sogou.com/weixin?query=%E9%A3%8E%E6%99%AF&type=2&page=17&ie=utf8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    # Session, RedisQueue and MySQL objects shared by the class: requests, queue scheduling and storage
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """
        fetch a proxy from the proxy pool
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """
        initialisation
        """
        # update the Session headers globally so every request carries the Cookies
        self.session.headers.update(self.headers)
        # build the start URL
        start_url = self.base_url + '?' + urlencode({'query': self.keyword, 'type': 2})
        # wrap it in a WeixinRequest; parse_index() is the callback and need_proxy=True means use a proxy
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)
        # put the request on the queue to schedule the first crawl
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """
        parse an index (search result) page
        :param response: the response
        :return: new requests
        """
        doc = pq(response.text)
        # all article links on this page
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            # wrap each one in a WeixinRequest and yield it
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # link to the next results page
        next = doc('#sogou_next').attr('href')
        if next:
            url = self.base_url + str(next)
            # wrap it in a WeixinRequest and yield it
            weixin_request = WeixinRequest(url=url, callback=self.parse_index, need_proxy=True)
            yield weixin_request

    def parse_detail(self, response):
        """
        parse an article detail page
        :param response: the response
        :return: the article as a dict
        """
        doc = pq(response.text)
        # extract title, body text, publish date, author nickname and account name into a dict
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#publish_time').text(),
            'nickname': doc('#meta_content > span.rich_media_meta.rich_media_meta_text').text(),
            'wechat': doc('#profileBt > #js_name').text()
        }
        yield data
        # the scheduler checks the result type; dicts are stored in MySQL via mysql.insert()


    def request(self, weixin_request):
        """
        execute a request
        :param weixin_request: the request
        :return: the response, or False on failure
        """
        try:
            # if the request needs a proxy, fetch one and send through it
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(), timeout=weixin_request.timeout,
                                             allow_redirects=False, proxies=proxies)
            # otherwise send directly; prepare() converts it to a PreparedRequest,
            # redirects are disabled and the request timeout is applied
            return self.session.send(weixin_request.prepare(), timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """
        error handling: count the failure and re-queue the request if it is under MAX_FAILED_TIME
        """
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """
        schedule requests: loop while the queue is not empty
        """
        while not self.queue.empty():
            # pop the next request and execute it with request()
            # after the first pass the queue holds the detail-page requests from page one plus the next-page request;
            # the detail-page requests are then handled by their own callback, parse_detail()
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            # if the status code is valid, hand the response to the request's callback (parse_index() or parse_detail())
            if response and response.status_code in VALID_STATUS:
                results = list(callback(response))
                # walk through the results and dispatch on their type
                if results:
                    for result in results:
                        print('New Result', result)
                        # new requests go back onto the queue, dicts go into MySQL
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """
        entry point
        """
        self.start()
        self.schedule()

if __name__ == '__main__':
    spider = Spider()
    spider.run()

request.py

from requests import Request
from weixin.config import TIMEOUT

class WeixinRequest(Request):
    def __init__(self, url, callback, method='GET', headers=None, need_proxy=False, fail_time=0, timeout=TIMEOUT):
        Request.__init__(self, method, url, headers)
        self.callback = callback
        self.need_proxy = need_proxy
        self.fail_time = fail_time
        self.timeout = timeout
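
Because WeixinRequest subclasses requests.Request, it can be prepared and sent through a Session just like a plain Request; the extra attributes (callback, need_proxy, fail_time, timeout) simply ride along for the scheduler. A minimal sketch:

from requests import Session
from weixin.request import WeixinRequest

req = WeixinRequest(url='https://www.baidu.com', callback=print, need_proxy=False)
session = Session()
resp = session.send(req.prepare(), timeout=req.timeout, allow_redirects=False)
print(resp.status_code, req.callback, req.need_proxy)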

db.py

from pickle import dumps, loads
from redis import StrictRedis
from weixin.config import REDIS_HOST, REDIS_PORT, REDIS_PASSWORD, REDIS_KEY
from weixin.request import WeixinRequest

class RedisQueue():
    def __init__(self):
        """
        初始化Redis
        """
        self.db = StrictRedis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)

    def add(self, request):
        """
        向队列添加序列化后的Request
        :param request: 请求对象
        :return: 添加结果
        """
        if isinstance(request, WeixinRequest):
            # 用pickle的dumps()方法序列化request,再用rpush加入队列
            return self.db.rpush(REDIS_KEY, dumps(request))
        return False

    def pop(self):
        """
        取出下一个Request并反序列化
        :return: Reqiest or None
        """
        if self.db.llen(REDIS_KEY):
            # 调用lpop()方法将请求从队列中取出,再用pickle的loads()方法转化为WeixinRequest对象
            return loads(self.db.lpop(REDIS_KEY))
        else:
            return False

    def clear(self):
        self.db.delete(REDIS_KEY)

    def empty(self):
        return self.db.llen(REDIS_KEY) == 0

if __name__ == '__main__':
    db = RedisQueue()
    start_url = 'http://www.baidu.com'
    weixin_request = WeixinRequest(url=start_url, callback='hello', need_proxy=True)
    db.add(weixin_request)
    request = db.pop()
    print(request)
    print(request.callback, request.need_proxy)

mysql.py

import pymysql
from weixin.config import *

class MySQL():
    def __init__(self, host=MYSQL_HOST, username=MYSQL_USER, password=MYSQL_PASSWORD, port=MYSQL_PORT,
                 database=MYSQL_DATABASE):
        """
        MySQL初始化
        :param host: 用于指定请求资源的主机IP和端口号,内容为请求URL的原始服务器或网关位置
        :param username:
        :param password:
        :param port:
        :param database:
        """
        try:
            # connect() creates the MySQL connection object
            self.db = pymysql.connect(host=host, user=username, password=password,
                                      database=database, charset='utf8', port=port)
            # on success, get a cursor; SQL statements are executed through it
            self.cursor = self.db.cursor()
        except pymysql.MySQLError as e:
            print(e.args)

    def insert(self, table, data):
        """
        插入数据
        :param table:
        :param data:
        :return:
        """
        keys = '.'.join(data.keys())
        values = '.'.join(['%s'] * len(data))
        # 构造SQL语句,value值 格式化%s实现,再用统一的元祖传到execute()方法里
        sql_query = 'insert into %s (%s) values (%s)'%(table, keys, values)
        try:
            self.cursor.execute(sql_query, tuple(data.values()))
            # commit是真正将语句提交到数据库执行的方法
            self.db.commit()
        except pymysql.MySQLError as e:
            print(e.args)
            # 异常处理,执行失败,用rollback()执行数据回滚,事务机制确保数据一致性
            self.db.rollback()
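
To see what insert() actually builds, here is a dry run for one article dict (the values are placeholders, no database involved):

data = {'title': 't', 'content': 'c', 'date': 'd', 'nickname': 'n', 'wechat': 'w'}
keys = ', '.join(data.keys())
values = ', '.join(['%s'] * len(data))
print('insert into %s (%s) values (%s)' % ('articles', keys, values))
# insert into articles (title, content, date, nickname, wechat) values (%s, %s, %s, %s, %s)
print(tuple(data.values()))  # passed separately as the parameters of cursor.execute()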

config.py

PROXY_POOL_URL = 'http://127.0.0.1:5555/random'

VALID_STATUS = [200]

TIMEOUT = 10

REDIS_HOST = 'localhost'

REDIS_PORT = 6379
REDIS_PASSWORD = ''
REDIS_KEY = 'weixin'

MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''
MYSQL_DATABASE = 'weixin'

MAX_FAILED_TIME = 20
KEYWORD = '风景'