python多线程实现访问页面_python 多线程/多进程 爬取关联网页信息

# [Python] view plain text / copy code
import requests

import time

import re

from fake_useragent import UserAgent

import multiprocessing as mp

import threading

from queue import Queue

# Set up the request headers.

# NOTE(review): verify_ssl=False is forwarded to fake_useragent's UserAgent;
# its exact effect depends on the installed library version — confirm.
ua = UserAgent(verify_ssl=False)

# ua.random is read once here, so every use of this dict carries the same
# User-Agent value for the lifetime of the process.
headers = {

"User-Agent": ua.random,

}

#获取网页文本

def getpage(url):
    """Fetch *url* and return the response body as text.

    The response encoding is set to the encoding detected from the body
    (``apparent_encoding``) before the text is decoded.

    Args:
        url: Address to request.

    Returns:
        The decoded page text, or ``''`` if the request or decoding fails.
    """
    try:
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params``, so passing the dict
        # positionally put the User-Agent into the query string instead of
        # the request headers.
        res = requests.get(url, headers=headers, timeout=30)
        res.encoding = res.apparent_encoding
        return res.text
    except Exception:
        # Best-effort fetch: any failure yields empty text, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return ''

#获取网页状态码

def getcode(url):
    """Request *url* and return its HTTP status code.

    Args:
        url: Address to request.

    Returns:
        The integer status code of the response, or ``0`` if the request
        fails for any reason (bad URL, connection error, timeout, ...).
    """
    try:
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params``, so passing the dict
        # positionally put the User-Agent into the query string instead of
        # the request headers.
        res = requests.get(url, headers=headers, timeout=30)
        return res.status_code
    except Exception:
        # Best-effort probe: any failure is reported as code 0, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 0

# 获取网页中文文本

def getchtext(url, q):
    """Put the distinct Chinese text runs found on a page onto a queue.

    The page at *url* is downloaded with ``getpage``; every maximal run of
    characters in the range U+4E00..U+9FA5 is collected, duplicates are
    dropped, and the remaining runs are joined with single spaces. The
    joined string (empty when nothing was found) is put on *q*.

    Args:
        url: Address of the page to scan.
        q: Queue-like object; receives exactly one string via ``q.put``.
    """
    page = getpage(url)
    # A set removes repeated runs; the order of the survivors is whatever
    # the set iteration yields.
    unique_runs = set(re.findall('[\u4e00-\u9fa5]+', page))
    q.put(" ".join(list(unique_runs)))

#获取网页及关联网页信息

if __name__ == '__main__':
    # Script entry point: for each seed site, collect the links on its home
    # page, keep those that answer with HTTP 200, fetch the Chinese text of
    # each reachable link in its own thread, and print the combined text.

    # Local imports keep this entry-point block self-contained.
    import tkinter
    from tkinter import messagebox
    from urllib.parse import urljoin

    start = time.time()

    links = ['http://www.fourd.cn', 'http://www.tfsea.com.cn', 'http://www.csscwshi.com' ,'http://www.marina-zh.com/', 'http://www.gdmoko.cn']

    # NOTE(review): indentation was lost in this listing; everything up to
    # the summary print is assumed to run once per seed site — confirm.
    for rawlink in links:
        rawtext = getpage(rawlink)  # raw HTML of the seed page

        # Every double- or single-quoted href value on the page, de-duplicated.
        link_list = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", rawtext)
        link_list = list(set(link_list))

        # Keep only the links that can actually be reached.
        csurls = []
        for link in link_list:
            # Candidate spellings, tried in order: the href as written, the
            # href with a scheme prepended, and the href resolved against
            # the seed URL.
            candidates = [link, 'http://' + link, 'https://' + link]
            try:
                # urljoin produces the URL that is both probed and stored;
                # previously 'rawlink//link' was probed while 'rawlink' +
                # 'link' (a different string) was stored.
                candidates.append(urljoin(rawlink, link))
            except ValueError:
                # Malformed href that cannot be parsed; skip this candidate.
                pass
            for candidate in candidates:
                if getcode(candidate) == 200:
                    csurls.append(candidate)
                    break

        # Different hrefs can resolve to the same URL; fetch each only once
        # while keeping first-seen order.
        csurls = list(dict.fromkeys(csurls))

        print('关联网址为:\n', csurls)

        # Fetch all reachable pages concurrently, one thread per URL.
        q = Queue()
        threads = []
        for csurl in csurls:
            t = threading.Thread(target=getchtext, args=(csurl, q))
            t.start()
            threads.append(t)

        for thread in threads:
            thread.join()

        # All producers have finished, so qsize() is exact here. Draining
        # by qsize() rather than len(csurls) cannot block forever if a
        # worker died before putting its result.
        res = [q.get() for _ in range(q.qsize())]

        # Combine the text of all linked pages for this site.
        corptext = " ".join(res)
        print('公司网页信息汇总:\n', corptext)

    end = time.time()
    cost = end - start
    print('共花费{}s'.format(cost))

    # Pop-up notification. A hidden root window is created explicitly so
    # that no empty default Tk window is shown next to the dialog.
    root = tkinter.Tk()
    root.withdraw()
    messagebox.showinfo("提示", "信息采集完成!")
    root.destroy()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值