Scrapy distributed crawler: the URL producer

  • Solves configurable URL extraction from list pages, automatic pagination, and automatic termination; powerful, compact, and usable in production
  • Crawler URL producer
  • A matching crawler URL consumer can be referenced separately
  • Supports extracting URL lists via GET and POST
  • Pagination can end on a total page count, on reaching a given page number, or when the next-page link is empty
  • Supports specifying the start page
  • Supports extracting sub-page URLs (article comments, forum replies, crawling an id first and then assembling the URL, etc.)
  • Supports multiple boards under one domain via the 【a,b,c】 shorthand
  • Supports giving different URLs different extraction priority via the level setting
  • Implemented with custom placeholders such as sub, {}, =>, == and 【】 (see the example after this list)
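
To make the placeholder syntax concrete, here is a minimal sketch of two hypothetical configuration records as they would look after being read from goto.xlsx into row dicts. The site, xpaths and values are invented purely for illustration; the field names are the keys the code below reads from each row, and empty Excel cells arrive as NaN:

# Hypothetical GET source: '{1}' is the paging placeholder (start at page 1),
# 'detail' is the xpath for list-page links, 'total_page' supplies the end condition.
get_row = {
    'start_url': 'https://example.com/news/',            # invented URL
    'page_url': 'list_{1}.html',
    'request_type': 'GET',
    'pre_re': float('nan'),                              # optional regex applied before 'detail'
    'detail': '//div[@class="list"]//a/@href',
    'total_count': float('nan'),
    'total_page': r'//span[@class="pages"]/text()|\d+',  # path plus '|regex' filter
    'over': float('nan'),                                # alternative end condition, written as 'path==value'
    'level': 1,
    'domain_flag': 'news',
    'source': 'example news',
}

# Hypothetical multi-board shorthand: 【1,2,3】 is expanded by for_url into one row per board,
# while '{1}' still drives the paging within each board.
board_page_url = 'forum-【1,2,3】-{1}.html'

For a 'sub' source (a row whose domain_flag contains 'sub'), page_url instead carries 'param=>path' pairs that sub_row fills in from each master URL before paging starts. The complete producer code follows.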
import pandas as pd
import requests
from scrapy.selector import Selector
import re
from urllib import parse
import json
import jsonpath
import math
import urllib3
import copy
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class Goto:
    def __init__(self):
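        # upper bound on the number of pages followed when only an 'over' end condition is configured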
        self.over_max_num = 100

    def first_request(self, row, urls, total_num=-1):
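        """Fetch one list page for a config row, collect detail URLs into `urls`, then paginate.

        row       : one configuration record (start_url, page_url, detail, total_count, total_page, over, ...)
        urls      : accumulator list that extracted URLs are appended to
        total_num : remaining page budget; -1 marks the very first request for this row
        """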
        start_url = row.get('start_url')
        page_url = row.get('page_url') if not self.isnan(row.get('page_url')) else ''
        page_extract = ''
        request_type = row.get('request_type') if not self.isnan(row.get('request_type')) else 'GET'
        pre_re = row['pre_re']
        detail = row['detail']
        total_count_path = row['total_count']
        total_page_path = row['total_page']
        over = row['over']
        if '{' in page_url:
            page_extract = page_url[page_url.find('{') + 1:page_url.find('}')]
            if '//' in page_extract or '$' in page_extract:
                if total_num == -1:
                    page_url = ''
                else:
                    # xpath pagination, e.g. '{//div=>123}': the local page_url keeps only the
                    # extracted value (123), while row['page_url'] keeps only the xpath
                    page_url = page_url.replace(page_url[page_url.find('{') + 1:page_url.find('=>') + 2], '')
                    row['page_url'] = row['page_url'].replace(
                        row['page_url'][row['page_url'].find('{') + 1:row['page_url'].find('}')],
                        page_extract.split('=>')[0]
                    )
            page_url = page_url.replace('{', '').replace('}', '')
        try:
            res = self.__get_response(start_url=start_url, page_url=page_url, request_type=request_type)
        except Exception:
            return []
        if self.isnan(detail):
            urls.append(parse.urljoin(start_url, page_url))
        else:
            detail_path = detail
            pre_re_extract = res.text
            if not self.isnan(pre_re):
                pre_re_extract = self.__extract_filer(res.text, pre_re)
                if len(pre_re_extract) > 0:
                    pre_re_extract = pre_re_extract[0]
            detail_url = ''  # used when the detail-page url is assembled from a custom template
            if '{' in detail_path and '}' in detail_path:
                detail_url = detail_path
                detail_path = detail_url[detail_url.find('{') + 1:detail_url.find('}')]
            detail_list = self.__extract_filer(text=pre_re_extract, path=detail_path)
            for _, v in enumerate(detail_list):
                if detail_url == '':
                    urls.append(parse.urljoin(start_url, v))
                else:
                    urls.append(detail_url.replace(detail_url[detail_url.find('{'):detail_url.find('}') + 1], str(v)))
        # single-page source: no pagination placeholder, so return what we have
        if len(page_url) == 0 and '//' not in page_extract and '$' not in page_extract:
            return urls
        # work out whether and how far to keep paginating
        if self.isnan(over):
            # first pass: derive the total page count
            if total_num == -1:
                if not self.isnan(total_count_path):
                    arr = self.__extract_filer(text=res.text, path=total_count_path)
                    total_num = max(math.ceil(int(arr[0]) / len(detail_list)), 1) if len(arr) > 0 else 0
                elif not self.isnan(total_page_path):
                    arr = self.__extract_filer(text=res.text, path=total_page_path)
                    total_num = int(arr[0]) if len(arr) > 0 else 0
        else:
            curr_value, over_value = over.split('==', 1)
            curr_value = self.__extract_filer(res.text, curr_value)
            curr_value = curr_value[0] if len(curr_value) > 0 else False
            if total_num == -1:
                total_num = self.over_max_num
            else:
                if not curr_value or curr_value == over_value:
                    total_num = 0
        total_num -= 1
        if total_num >= 1:
            self.__next_request(row=row, res=res, request_type=request_type, urls=urls, total_num=total_num)
        # roll back one invalid url; the outermost call of the recursion sees total_num == self.over_max_num - 1
        urls = urls[0:-1] if total_num == self.over_max_num - 1 else urls
        return urls

    #  Build the form data dict for a POST request
    def __get_post_data(self, page_url):
        data = {}
        page_url = page_url.replace('\t', '').replace('\r', '').replace('\n', '')
        arr = page_url.split(',')
        for i, v in enumerate(arr):
            v = v.replace("\n", '')
            v = v.strip()
            key, value = v.split(':', 1)  # split on the first ':' only, so values may contain ':'
            key, value = key.strip(), value.strip()
            data[key] = value
        return data

    # Follow-up extraction after the detail page; master_urls may be a url, a list of urls, or html
    def sub_row(self, row, master_urls):
        sub_url_list = []
        if isinstance(master_urls, str):
            master_urls = [master_urls]
        page_url = row['page_url']  # keep the original template; each copy below gets mutated
        for _, v in enumerate(master_urls):
            new_page_url = self.__get_page_param(page_url=page_url, url=v)
            if not new_page_url:
                print(f'sub url extraction: __get_page_param failed to build params from {v}')
                continue
            new_row = copy.deepcopy(row)
            new_row['page_url'] = new_page_url
            sub_url_list += self.first_request(row=new_row, urls=[])
        return sub_url_list

    # Expand the start url: one row per keyword
    def for_url(self, row, keywords):
        page_url = row.get('page_url') if not self.isnan(row.get('page_url')) else ''
        row_list = []
        if page_url:
            if isinstance(keywords, str):
                keywords = keywords.replace('\n', '').replace('\r', '').replace('\t', '').strip().split(',')
            for _, v in enumerate(keywords):
                new_row_dict = copy.deepcopy(row)
                new_page_url = page_url.replace(page_url[page_url.find('【'):page_url.find('】') + 1], str(v))
                if not new_page_url:
                    print(f'for_url: error expanding keyword {v} into page_url')
                    continue
                new_row_dict['page_url'] = new_page_url
                row_list.append(new_row_dict)
        else:
            row_list.append(row)
        return row_list

    # Get the keyword list written inside 【】 in the configured page_url
    def get_for_url_keyword(self, row):
        page_url = row.get('page_url') if not self.isnan(row.get('page_url')) else ''
        keyword = ''
        if '【' in page_url and '】' in page_url:
            keyword = str(page_url[page_url.find('【') + 1:page_url.find('】')]).strip()
        return keyword

    #  Build and issue the next-page request
    def __next_request(self, row, res, request_type, urls, total_num):
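        """Build the next page_url (numeric page + 1, or the xpath-extracted next link),
        refresh any 'key=//xpath' POST fields from the current response, then recurse via first_request.
        """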
        page_url = row['page_url']
        s0 = page_url.find('{')
        s1 = page_url.find('}')
        page_extract = page_url[page_url.find('{') + 1:page_url.find('}')]
        start_flag = self.__extract_filer(res.text, page_extract)[0]
        if '//' in page_extract:
            start_flag = f'{page_extract}=>{start_flag}'
        else:
            start_flag = str(int(start_flag) + 1)
        next_page = page_url[:s0 + 1] + start_flag + page_url[s1:]
        if request_type == 'POST':
            arr = next_page.split(',')
            for k, v in enumerate(arr):
                if '=//' in v:
                    v = v.replace("\n", '')
                    v = v.strip()
                    key = v.split('=//')[0]
                    value = v.split('=//')[-1]
                    value = f"//{value.split(':')[0]}"
                    arr[k] = f"{key}={value}:{self.__extract_filer(res.text, value)[0]}"
            next_page = ','.join(arr)
        # print(f'next_page:{next_page}')
        row['page_url'] = next_page
        self.first_request(row=row, urls=urls, total_num=total_num)

    #  Request a URL and return the requests response
    def __get_response(self, start_url, page_url=None, request_type='GET'):
        url = start_url
        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:26.0) Gecko/20100101 Firefox/26.0'}
        if request_type == 'GET':
            if page_url:
                url = parse.urljoin(start_url, page_url)
            # print(f'url: {url}')
            res = requests.get(url, headers=header, verify=False, timeout=30)
        elif request_type == 'POST':
            data = self.__get_post_data(page_url=page_url)
            # print('data', data)
            res = requests.post(url, data=data, headers=header, verify=False, timeout=30)
        res.encoding = 'utf-8'
        return res

    # Extract by path, then optionally filter with a regex; paths may be jsonpath, xpath, an integer, or a regex
    def __extract_filer(self, text, path):
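        """Extract values from text by path.

        A path starting with '$' is treated as jsonpath, one starting with '//' as xpath,
        and anything else as a regular expression; an optional '|regex' suffix applies a
        second regex filter to the result. Numeric paths are returned as a one-element list.
        """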
        grep = None
        value = []
        if isinstance(path, str):
            if '|' in path:
                grep = path.split('|')[-1]
                path = path.split('|')[0]
            if path.startswith('$'):
                obj = json.loads(text)
                temp = jsonpath.jsonpath(obj, path)
                value = temp if temp else []
            elif path.startswith('//'):
                selector = Selector(text=text)
                value = selector.xpath(path).extract()
            else:
                value = self.__filter(obj=text, regular=path)
            if grep:
                if len(value) > 0:
                    value = self.__filter(obj=value, regular=grep)
                else:
                    value = self.__filter(obj=text, regular=grep)
        elif isinstance(path, int) or isinstance(path, float):
            value = [path]
        return value

    # 对get请求参数计算
    def __get_page_param(self, page_url, url):
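        """Fill 'param=>path' pairs in a GET query string with values extracted from `url`; used by sub_row."""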
        if '&' in page_url:
            params = str(page_url).split('&')
        elif '?' in page_url:
            params = str(page_url).split('?')
        else:
            params = [str(page_url)]  # single parameter, nothing to split
        for i, param in enumerate(params):
            if '=>' in param:
                k_v = str(param).split('=>')
                value = self.__extract_filer(url, path=k_v[-1])
                if len(value) == 0:
                    return False
                k_v[-1] = value[0]
                param = '='.join(k_v)
            params[i] = param
        page_url = '&'.join(params)
        return page_url

    # Regex helper: findall over a string (lists are joined first)
    def __filter(self, obj, regular):
        text = obj
        if not isinstance(obj, str):
            text = ''.join(obj)
        pattern = re.compile(regular)
        return pattern.findall(text)

    # Empty-cell check: NaN (from empty pandas/Excel cells) is the only value not equal to itself
    def isnan(self, num):
        return num != num


if __name__ == '__main__':
    import time
    t = time.time()
    excel_path = '../../data/goto.xlsx'
    df = pd.read_excel(excel_path)
    goto = Goto()
    # level 0 marks a paused source
    df = df[df['level'] != 0]
    df = df.sort_values('level', axis=0, ascending=False)
    keyword_dict = {}
    words_df = pd.read_csv('../../data/goto_keywords.csv')
    keyword_dict[words_df['words'].values.tolist()[0]] = words_df['words'].values.tolist()
    keyword_dict[words_df['code_utf-8'].values.tolist()[0]] = words_df['code_utf-8'].values.tolist()
    keyword_dict[words_df['code_gbk'].values.tolist()[0]] = words_df['code_gbk'].values.tolist()
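    # keyword_dict maps a placeholder token (the first value of each column) to that column's full
    # keyword list: plain words, utf-8 url-encoded, and gbk url-encoded variants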
    test_http = 'http://info.xcar.com.cn/202007/news_2049855_1.html'
    domain_valid_dict = {'count': 0, 'urls_len': 0}
    for i, row in df.iterrows():
        if 'sub' not in row['domain_flag']:
            row_dict = row.to_dict()
            words = goto.get_for_url_keyword(row_dict).upper()
            # keywords are upper-cased by default before the dictionary lookup
            words = keyword_dict[words] if keyword_dict.get(words) else words
            row_list = goto.for_url(row_dict, words)
            for _, v in enumerate(row_list):
                urls = list(set(goto.first_request(row=v, urls=[])))
                print(f'source #{i} {v["source"]} {len(urls)} {urls}')
                domain_valid_dict['count'] += 1
                domain_valid_dict['urls_len'] += len(urls)
        else:
            urls = goto.sub_row(row=row.to_dict(), master_urls=test_http)
            print(f'source #{i} level: {row["level"]} {row["source"]} {len(urls)} {urls}')
            domain_valid_dict['count'] += 1
            domain_valid_dict['urls_len'] += len(urls)
    print(f'valid sources: {domain_valid_dict["count"]}, '
          f'urls extracted: {domain_valid_dict["urls_len"]}, '
          f'total time: {round(time.time() - t, 2)}s, '
          f'throughput: {round(domain_valid_dict["urls_len"]/(time.time() - t), 2)} urls/s')

Screenshot of the goto URL configuration file (goto.xlsx)
