Multi-threaded scraping of Baidu keyword search results, with resolution of the real URLs

Project purpose: practice.

Project requirement: given a set of keywords, query Baidu for each one and save the results to a file.

Problems encountered:

1. Reading values out of a Python list: when the contents are hard to see at a glance, inspect them with for index, item in enumerate(array):
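
A minimal standalone sketch of inspecting a list this way (the array contents here are made up purely for illustration):

array = ['减肥计划', '', '   ', '减肥食谱']
for index, item in enumerate(array):
    print(index, repr(item))  # repr() makes empty and whitespace-only entries visible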

2. Selecting the wanted elements; two approaches (sketched below):

One: tag.h3.a['href']

Two: tagh3 = result.find_all('h3'); for h3 in tagh3: href = h3.find('a').get('href')
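
A minimal side-by-side sketch of the two approaches, assuming result is a BeautifulSoup object parsed from a Baidu results page (the class names are the ones used in the full code below):

# Approach one: walk the tag tree directly from each result container
for tag in result.find_all('div', 'result c-container'):
    href = tag.h3.a['href']
    title = tag.h3.a.text

# Approach two: collect all h3 tags first, then drill into each one
for h3 in result.find_all('h3'):
    a = h3.find('a')
    if a is not None:
        href = a.get('href')
        title = a.text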

3. Building the search URLs

out_url = [(key, page, "https://www.baidu.com/s?wd={}&pn={}".format(key, page * 10),) for key in keys for page in range(pages)]
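
For a single key the comprehension yields tuples like these (note that pn advances in steps of 10); each tuple is later unpacked as the arguments of getkeys(key, page, url):

out_url = [(key, page, "https://www.baidu.com/s?wd={}&pn={}".format(key, page * 10),)
           for key in ['减肥'] for page in range(2)]
# -> [('减肥', 0, 'https://www.baidu.com/s?wd=减肥&pn=0'),
#     ('减肥', 1, 'https://www.baidu.com/s?wd=减肥&pn=10')]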

4. Filtering Baidu's own content out of the results

title = tag.h3.a.text; if '百度' in title: break

5. Removing Baidu's aggregated blocks (such as "video collection" pages)

if not href.startswith('http'): break

6. Getting the real URL behind a Baidu search result

baidu_url = requests.get(url=href, headers=myhead, allow_redirects=False)

real_url = baidu_url.headers['Location'] # the original page address

if real_url.startswith('http'):

allow_redirects=False is the key point: it stops requests from following the redirect.
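
A minimal sketch, assuming href holds a redirect link (of the form http://www.baidu.com/link?url=...) taken from a results page and myhead is the request-header dict defined in the code below:

import requests

baidu_url = requests.get(url=href, headers=myhead, allow_redirects=False)  # do not follow the 302
real_url = baidu_url.headers.get('Location', '')  # the address Baidu redirects to
if real_url.startswith('http'):
    print(real_url)  # the real page address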

7. Passing tasks and results

self.work_queue = Queue() # task queue

self.result_queue = Queue() # result queue

 

8. Threads hanging

Always use while not self.work_queue.empty(): rather than while True: (a standalone sketch of the pattern follows).
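
A minimal standalone sketch with a plain queue that is filled before the loop runs, just as the worker threads below drain a pre-filled work queue:

from queue import Queue

q = Queue()
for n in range(5):
    q.put(n)

# With `while True:` and a blocking get(), the loop would hang once the queue is drained.
# Draining a pre-filled queue like this lets the loop (and the thread running it) finish.
while not q.empty():
    n = q.get(block=False)
    print('working on', n)
    q.task_done()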

9. That's it; here is the code. It has been tweaked slightly to make debugging easier, as noted in the comments.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import time
from retrying import retry
import requests
from bs4 import BeautifulSoup
import threading
from queue import Queue

lock = threading.RLock()


class WorkManager(object):
    def __init__(self, do_job, works, thread_num=25):
        self.job = do_job
        self.work_queue = Queue()  # task queue
        self.result_queue = Queue()  # result queue
        self.threads = []
        self.__init_work_queue(works)
        self.__init_thread_pool(thread_num)

    # Initialize the work queue: enqueue each job
    def __init_work_queue(self, works):
        for item in works:
            # print('__init_work_queue item:', item)  # the argument tuple
            self.work_queue.put((self.job, item))  # put the job function and its arguments into the work queue

    # Initialize the threads; the number of threads running concurrently has an effect (not fully sure why)
    def __init_thread_pool(self, thread_num):
        for i in range(thread_num):
            self.threads.append(Work(self.work_queue, self.result_queue))

    # Wait for all threads to finish
    def wait_allcomplete(self):
        '''
        @description: wait for the threads to finish and collect their results
        @return: result_list
        '''
        for item in self.threads:
            if item.is_alive():
                item.join()

        result_list = []
        for i in range(self.result_queue.qsize()):
            res = self.result_queue.get()
            #print('wait_allcomplete:', res)
            result_list.append(res)
        return result_list


class Work(threading.Thread):
    def __init__(self, work_queue, result_queue):
        threading.Thread.__init__(self)
        self.work_queue = work_queue
        self.result_queue = result_queue
        self.start()  # start the thread

    def run(self):
        # never use an endless loop here
        while not self.work_queue.empty():
            try:
                do, args = self.work_queue.get(block=False)  # dequeue a task without blocking
                # print('Work args:', args)  # args is a list or tuple; check it here when debugging
                result = do(*args)  # unpack the list/tuple into positional arguments
                #print('work run result:', result, flush=True)
                self.result_queue.put(result)  # store the function's return value
                self.work_queue.task_done()  # tell the queue this task is done
                with lock:
                    print('{}\tdone\twith\t{}\tat\t{}'.format(threading.current_thread().name, args[0], get_stime()), flush=True)
            except Exception as error:
                print(error, flush=True)
                break


def get_stime():
    ct = time.time()
    local_time = time.localtime(ct)
    data_head = time.strftime("%Y-%m-%d %H:%M:%S", local_time)
    data_secs = (ct - int(ct)) * 1000
    stamp = "%s.%03d" % (data_head, data_secs)
    return stamp


myhead = {
    'User-Agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate,sdch, br',
    'Accept-Language': 'zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4',
    'Cache-Control': 'max-age=0',
    'Connection': 'close',
    'Proxy-Connection': 'no-cache'
}


def parse_url(url, params=None, headers=myhead, proxies=None, timeout=6, ecode='utf-8',
              wait_random_min=200, wait_random_max=3000, stop_max_attempt_number=100):

    @retry(wait_random_min=wait_random_min, wait_random_max=wait_random_max, stop_max_attempt_number=stop_max_attempt_number)
    def _parse_url(url):
        response = requests.get(url, params=params, headers=headers, proxies=proxies, timeout=timeout)
        assert response.status_code == 200
        # because of the status_code == 200 assertion this cannot be used to resolve the real Baidu URLs, since those respond with 302
        return response.content.decode(ecode)

    try:
        response = _parse_url(url)
        soup = BeautifulSoup(response, 'lxml')
        [s.extract() for s in soup(["script", "style"])]
    except requests.exceptions.ConnectionError as e:
        print('ConnectionError:', e, url, flush=True)
        soup = None
    except requests.exceptions.ChunkedEncodingError as e:
        print('ChunkedEncodingError:', e, url, flush=True)
        soup = None
    except Exception as e:
        print('Unexpected error:', e, url, flush=True)
        soup = None
    return soup


def fd():
    import win32ui
    _dlg = win32ui.CreateFileDialog(1)  # 1 means an "open file" dialog
    _dlg.SetOFNInitialDir('c:/')  # initial directory shown in the dialog
    _dlg.DoModal()
    filename = _dlg.GetPathName()  # the selected file's full path
    return filename


def make_urls(pages):
    '''
    _k = []
    _file = fd()
    if not _file:
        return False
    res = _file.split('.')[0:-1]  # file name with full path, suffix stripped

    with open(_file) as f:
        for row in f.readlines():
            row = row.strip()  # strip leading/trailing whitespace
            if len(row) == 0:
                continue  # skip empty lines
            _k.append(row)
    keys = sorted(set(_k), key=_k.index)
    # For the demo, a literal list replaces reading keys from a file
    '''
    keys = [
        "减肥计划",
        "减肥运动",
        "如何减肥",
        "怎么减肥",
        "有效减肥",
        "郑多燕减肥",
        "减肥视频",
        "减肥",
        "减肥方法",
        "减肥食谱",
        "   ",
        "减肚子",
        "腰腹减肥",
        "\t",
        "减腰",
        "减肥法",
        "减肥法"
    ]

    out_url = [(key, page, "https://www.baidu.com/s?wd={}&pn={}".format(key, page * 10),) for key in keys for page in range(pages)]
    return 'baidu', out_url
    # return res[0], out_url


def getkeys(key, page, url):
    _texts = []
    result = parse_url(url=url)
    if result is None:  # the request failed; return an empty list rather than crash the worker thread
        return _texts
    '''
    # Approach 1
    tagh3 = result.find_all('h3')
    index = 0
    for h3 in tagh3:
        href = h3.find('a').get('href')
        title = h3.find('a').text
        if '百度' in title:
            break
        if not href.startswith('http'):
            break
        baidu_url = requests.get(url=href, headers=myhead, allow_redirects=False)  # do not follow the redirect
        real_url = baidu_url.headers['Location']  # the original page address
        if real_url.startswith('http'):
            index += 1
            _texts.append([index, title, real_url])
    # End of Approach 1
    '''

    # Approach 2, gives the same result as Approach 1
    allTags = result.find_all('div', ['result-op c-container xpath-log', 'result c-container'])
    # 'result-op c-container xpath-log' marks Baidu's own content blocks
    index = 0
    for tag in allTags:
        href = tag.h3.a['href']
        title = tag.h3.a.text
        if '百度' in title:
            break
        if not href.startswith('http'):
            break
        baidu_url = requests.get(url=href, headers=myhead, allow_redirects=False)  # do not follow the 302 redirect
        real_url = baidu_url.headers.get('Location', '')  # the original page address ('' if there was no redirect)
        if real_url.startswith('http'):
            index += 1
            _texts.append([key, page, index, title, real_url])
    # 方法2结束

    return _texts


def savefile(_filename, lists):
    # Write the scraped result lists to a file
    print('[' + _filename + '] saving started......', end='', flush=True)
    lists.sort()

    with open(_filename, 'a', encoding='utf-8') as f:
        for lists_line in lists:
            for index, item in enumerate(lists_line):
                f.write('key:' + item[0] + '\tpage:' + str(item[1]) + '\tindex:' + str(item[2]) + '\ttitle:' + item[3] + '\turl:' + item[4] + '\n')

    print('[' + _filename + '] saving finished.', flush=True)


def main():
    start = time.time()
    try:
        _name, urls = make_urls(10)
    except Exception as e:
        print(e)
        return False

    work_manager = WorkManager(getkeys, urls)  # arguments: job function, list of argument tuples, thread count
    texts = work_manager.wait_allcomplete()
    savefile(_name + '_百度词频.txt', texts)
    print("threadPool cost all time: %s" % (time.time() - start), flush=True)


if __name__ == "__main__":
    main()
    # threadPool cost all time: 27.787729501724243

 
