python 多线程卡死跳出_【在线等】求大佬改进代码,Python多线程卡死问题求解决...

[Python] 纯文本查看 复制代码
#!/usr/bin/env python3

# -*- coding: utf-8 -*-

"""

百度移动相关搜索词挖掘脚本(多线程版)

基于python3.8

需要安装requests模块

"""

import random
import re
from queue import Queue
from threading import Lock, Thread
from urllib.parse import quote

import requests

class Qh360Spider(Thread):
    """Worker thread that mines related search keywords from so.com.

    Each worker repeatedly takes a ``(keyword, depth)`` pair from a shared
    queue, downloads the search page for that keyword, extracts keywords from
    the page and puts previously unseen ones back on the queue until the
    configured expansion depth is reached.
    """

    # Keyword -> number of times it was extracted; shared by all workers.
    result = {}
    # Keywords that are already queued (fetched or pending); shared by all
    # workers.
    seen = set()
    # Guards ``result``, ``seen`` and writes to the shared failure file, all
    # of which are touched concurrently by every worker thread.
    _lock = Lock()

    def __init__(self, kw_queue, loop, failed):
        """Create a worker.

        :param kw_queue: queue of ``(keyword, depth)`` tuples to process
        :param loop: maximum expansion depth; keywords found at this depth
            are counted but not queued again
        :param failed: writable text file object that receives one line per
            keyword whose page could not be fetched
        """
        super(Qh360Spider, self).__init__()
        self.kw_queue = kw_queue
        self.loop = loop
        self.failed = failed
        # Pool of User-Agent strings; one is picked at random per request.
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36Chrome 17.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0Firefox 4.0.1',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        ]

    def run(self):
        """Process queue items forever.

        Every item taken from the queue is acknowledged with ``task_done()``
        no matter how processing ended; without that acknowledgement a
        ``Queue.join()`` on the shared queue never returns.
        """
        while True:
            # Blocks until an item is available.
            kw, cloop = self.kw_queue.get()
            try:
                self._crawl(kw, cloop)
            except Exception as err:
                # Thread boundary: an escaping exception would terminate this
                # worker for good, so report it and keep going.
                print('Worker error on {!r}: {}'.format(kw, err))
                self._record_failure(kw)
            finally:
                self.kw_queue.task_done()

    def _crawl(self, kw, cloop):
        """Fetch the result page for ``kw`` and feed its keywords to ``filter``."""
        print('CurLoop:{} Checking: {}'.format(cloop, kw))
        # Percent-encode the keyword so characters such as '&', '#' or '+'
        # cannot change the meaning of the query string.
        query = 'https://www.so.com/s?q={}'.format(quote(kw, safe=''))
        source = self.download(query, timeout=10)
        if source:
            kw_list = self.extract(source)
            print(kw_list)
            self.filter(cloop, kw_list)
        else:
            # No page source: remember the keyword as failed.
            self._record_failure(kw)

    def _record_failure(self, kw):
        """Append ``kw`` to the shared failure file, one keyword per line."""
        try:
            with self._lock:
                self.failed.write('{}\n'.format(kw))
        except (OSError, ValueError) as err:
            # ValueError covers a closed file and encoding errors.
            print('Could not record failed keyword {!r}: {}'.format(kw, err))

    def download(self, url, timeout=5, proxy=None, num_retries=5):
        """Download a page and return its body as text.

        :param url: URL to fetch
        :param timeout: request timeout in seconds
        :param proxy: passed unchanged as the ``proxies`` argument of
            ``requests.get``
        :param num_retries: number of additional attempts after the first one
            fails with a ``requests.RequestException``
        :return: the response body decoded as UTF-8 (undecodable bytes are
            replaced), or ``None`` when every attempt failed
        """
        # One initial attempt plus ``num_retries`` retries.
        for _ in range(max(num_retries, 0) + 1):
            headers = {
                # NOTE(review): hard-coded session cookie, presumably captured
                # from a browser session -- it may have expired; confirm.
                "Cookie": "QiHooGUID=41F80B0CCE5D43A22EEF0305A12CDE3F.1596003342506; __guid=15484592.2994995584481314300.1596003341831.5723; soid=TjzBKt3zrO-Rh1S7fXSb0S!6kmX5TlEerB2URZz9v4; __md=667cb161f9515972323507763d8fa7dd643a65bd2e88034.9; dpr=1; isafe=1; webp=1; _uc_m2=886a48052dbb9e2291f80055746e0d4f1f110f922b2f; _uc_mid=7cb161f953d8fa7dd643a65bd2e88034; __huid=11xZqhEl%2FfVeqclI4j%2BdQeQvX63Oph%2F%2BCVM5vxqYGxQI4%3D; Q=u%3Duhthb002%26n%3D%26le%3DAwH0ZGV5ZGR3WGDjpKRhL29g%26m%3DZGH5WGWOWGWOWGWOWGWOWGWOZGL0%26qid%3D144048053%26im%3D1_t018c25fbb66797efb2%26src%3D360chrome%26t%3D1; T=s%3D2afa764886f737dd5d23421c30f87a1f%26t%3D1595934758%26lm%3D0-1%26lf%3D2%26sk%3De485bbde46ac34fc27fc40215de76c44%26mt%3D1595934758%26rc%3D1%26v%3D2.0%26a%3D1; _S=tg75a7e3fmv0mfdfkt8jlpfpj6; stc_ls_sohome=RRzRSR!RTR(RUR_RVR; gtHuid=1; homeopenad=1; _pp_wd=1; _ga=GA1.2.607533084.1598082638; _gid=GA1.2.1887117715.1598082638; count=6; erules=p1-9%7Cp2-11%7Cp4-3%7Cecl-2%7Ckd-1%7Cp3-2",
                # A fresh random User-Agent is chosen for every attempt.
                'User-Agent': random.choice(self.ua_list)
            }
            try:
                resp = requests.get(url, headers=headers, proxies=proxy, timeout=timeout)
            except requests.RequestException as err:
                print('Download error:', err)
                continue
            print(resp.status_code)
            # errors='replace' keeps a single malformed byte from raising
            # UnicodeDecodeError and discarding the whole page.
            return resp.content.decode('utf-8', errors='replace')
        return None

    @staticmethod
    def extract(html):
        """Extract keywords from a search result page.

        :param html: page source of the search result
        :return: list of strings matched by the extraction pattern
        """
        # NOTE(review): with re.S the pattern '(.+?)' matches every single
        # character. It looks like the HTML tags that delimited the capture
        # group were lost when this code was published -- restore the real
        # markup around the group before relying on the output.
        return re.findall(r'(.+?)', html, re.S | re.I)

    def filter(self, current_loop, kwlist):
        """Count extracted keywords and queue the new ones for expansion.

        Every keyword in ``kwlist`` has its counter in ``result`` incremented.
        A keyword is additionally queued at depth ``current_loop + 1`` when
        ``current_loop`` is below the configured maximum and the keyword has
        not been queued before.

        :param current_loop: expansion depth of the keyword that produced
            ``kwlist``
        :param kwlist: keywords extracted from the page
        :return: None
        """
        cls = Qh360Spider
        may_expand = current_loop < self.loop
        for kw in kwlist:
            # The membership test and the update must happen atomically,
            # otherwise two workers can both queue the same keyword or lose
            # a counter increment.
            with self._lock:
                is_new = may_expand and kw not in cls.seen
                if is_new:
                    cls.seen.add(kw)
                cls.result[kw] = cls.result.get(kw, 0) + 1
            if is_new:
                self.kw_queue.put((kw, current_loop + 1))

def main():
    """Seed the queue from keywords.txt, run the workers and save the result.

    Reads seed keywords (GBK encoded, one per line) from ``keywords.txt``,
    starts 15 daemon worker threads with a maximum expansion depth of 3,
    waits for the queue to drain, then writes every collected keyword to
    ``ah360key.txt`` ordered by descending occurrence count. Keywords whose
    page could not be fetched are written to ``faileds.txt``.
    """
    # Queue of (keyword, depth) tuples shared by all workers.
    k_queue = Queue()

    # Seed the queue and the class-level ``seen`` set.
    with open('keywords.txt', encoding="GBK") as kwfile:
        for key in kwfile:
            key = key.strip()
            # Skip blank lines and repeated seeds so no empty or duplicate
            # query is sent.
            if not key or key in Qh360Spider.seen:
                continue
            k_queue.put((key, 1))
            Qh360Spider.seen.add(key)

    # The failure file stays open for as long as the workers may write to it
    # and is closed even if starting a worker raises.
    with open('faileds.txt', 'w') as check_failed:
        for _ in range(15):
            bds = Qh360Spider(k_queue, 3, check_failed)
            # Daemon threads do not keep the process alive once the main
            # thread has finished.
            bds.daemon = True
            bds.start()
        # Blocks until every queued item has been acknowledged with
        # task_done().
        k_queue.join()

    # Most frequent keywords first.
    sort_list = sorted(Qh360Spider.result.items(), key=lambda x: x[1], reverse=True)
    with open('ah360key.txt', 'w', encoding='utf8') as save:
        for keyword, _count in sort_list:
            # Only the keyword is written; the count is used for ordering.
            save.write('%s\n' % keyword)

    print('done,完成挖掘')


if __name__ == '__main__':
    main()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值