python多线程实现访问页面_python 多线程/多进程爬取关联网页信息

最新推荐文章于 2022-05-15 23:34:44 发布

weixin_39622587

最新推荐文章于 2022-05-15 23:34:44 发布

阅读量239

点赞数

文章标签： python多线程实现访问页面

[Python] 纯文本查看复制代码

import requests

import time

import re

from fake_useragent import UserAgent

import multiprocessing as mp

import threading

from queue import Queue

# Request headers shared by every HTTP call in this script; the User-Agent
# value comes from fake_useragent's ``ua.random``.
ua = UserAgent(verify_ssl=False)
headers = {"User-Agent": ua.random}

# Fetch the text of a web page
def getpage(url):
    """Return the decoded body of ``url``, or ``''`` if the request fails.

    The module-level ``headers`` dict is sent as the HTTP request headers and
    the request is given a 30 second timeout. The response is decoded using
    ``res.apparent_encoding`` rather than the encoding requests picked by
    default.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, or an empty string when anything goes wrong while
        fetching or decoding.
    """
    try:
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params``, which would put the
        # User-Agent into the query string instead of the request headers.
        res = requests.get(url, headers=headers, timeout=30)
        res.encoding = res.apparent_encoding
        return res.text
    except Exception:
        # Best-effort fetch: callers treat an empty string as "no content".
        # ``Exception`` (not a bare except) keeps Ctrl-C working.
        return ''

# Fetch the HTTP status code of a web page
def getcode(url):
    """Return the HTTP status code for a GET of ``url``, or ``0`` on failure.

    The module-level ``headers`` dict is sent as the HTTP request headers and
    the request is given a 30 second timeout.

    Args:
        url: Address to probe.

    Returns:
        The response's ``status_code``, or ``0`` when the request could not
        be completed (bad URL, connection error, timeout, ...).
    """
    try:
        # ``headers`` must be passed by keyword: the second positional
        # parameter of ``requests.get`` is ``params``, which would put the
        # User-Agent into the query string instead of the request headers.
        res = requests.get(url, headers=headers, timeout=30)
        return res.status_code
    except Exception:
        # Best-effort probe: 0 never equals a real status code.
        # ``Exception`` (not a bare except) keeps Ctrl-C working.
        return 0

# Extract the Chinese text of a web page
def getchtext(url, q):
    """Put the distinct Chinese text runs found at ``url`` onto ``q``.

    The page is downloaded with ``getpage``. Every run of consecutive
    characters in the range U+4E00..U+9FA5 is collected, duplicates are
    dropped (so the resulting order is not the order of appearance), and the
    remaining runs are joined with single spaces.

    Args:
        url: Address of the page to read.
        q: Queue-like object; the joined string is delivered via ``q.put``.

    Returns:
        None. The result is communicated through ``q`` only.
    """
    page = getpage(url)
    unique_runs = set(re.findall('[\u4e00-\u9fa5]+', page))
    q.put(" ".join(unique_runs))

# Collect the Chinese text of every reachable page linked from each site
if __name__ == '__main__':
    # Local import: only this entry point needs relative-link resolution.
    from urllib.parse import urljoin

    start = time.time()

    links = [
        'http://www.fourd.cn',
        'http://www.tfsea.com.cn',
        'http://www.csscwshi.com',
        'http://www.marina-zh.com/',
        'http://www.gdmoko.cn',
    ]

    # Matches the value of href="..." or href='...' attributes.
    href_pattern = re.compile(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')")

    for rawlink in links:
        rawtext = getpage(rawlink)  # HTML of the site's landing page
        # NOTE: only the linked pages are summarised below; the landing
        # page's own text is not added to the summary.

        link_list = list(set(href_pattern.findall(rawtext)))  # de-duplicated

        # Keep, for each link, the first URL form that answers with 200.
        csurls = []
        for link in link_list:
            try:
                resolved = urljoin(rawlink, link)  # handles relative links
            except ValueError:
                # urljoin rejects some malformed hrefs; fall back to plain
                # concatenation so one bad link cannot abort the whole run.
                resolved = rawlink + '//' + link
            candidates = (link, 'http://' + link, 'https://' + link, resolved)
            # The generator is lazy, so probing stops at the first success,
            # and the URL that is stored is exactly the URL that was probed.
            csurl = next((c for c in candidates if getcode(c) == 200), "")
            if csurl:
                csurls.append(csurl)

        print('关联网址为：\n', csurls)

        # Threaded mode: one worker thread per reachable URL, results
        # collected through a shared queue.
        q = Queue()
        threads = []
        for csurl in csurls:
            t = threading.Thread(target=getchtext, args=(csurl, q))
            t.start()
            threads.append(t)
        for thread in threads:
            thread.join()

        # Each worker put exactly one item, so drain one item per URL.
        res = [q.get() for _ in csurls]

        # Merge everything gathered for this site.
        corptext = " ".join(res)
        print('公司网页信息汇总：\n', corptext)

    end = time.time()
    cost = end - start
    print('共花费{}s'.format(cost))

    # Pop-up notification once the whole run has finished.
    from tkinter import messagebox

    messagebox.showinfo("提示", "信息采集完成！")

weixin_39622587

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python多线程实现访问页面_python 多线程/多进程爬取关联网页信息

[Python] 纯文本查看复制代码import requestsimport timeimport refrom fake_useragent import UserAgentimport multiprocessing as mpimport threadingfrom queue import Queue#设置headersua = UserAgent(verify_ssl=False)h...
复制链接

扫一扫