import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import time
import threading
def spider(num, keyword):
    """Crawl Baidu search results for *keyword* and record each result's domain.

    Walks 8 result pages starting at offset ``num * 10`` (in steps of 100),
    follows every result link to resolve its real URL, prints the domain and
    appends it to the module-level ``fileDomain`` handle (opened in main).

    num     -- thread index; offsets the starting result page so that the
               10 spider threads cover disjoint slices.
    keyword -- the search term to query.
    """
    headers = {
        # Spoof a desktop Chrome UA so Baidu serves normal result pages.
        'User-Agent': 'Mozilla/5.0(WindowsNT6.1;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/72.0.3626.121Safari/537.36'}
    for offset in range(0, 800, 100):  # was named 'sum' -- don't shadow the builtin
        url_search = 'https://www.baidu.com/s?wd=' + keyword + '&pn=' + str(num * 10 + offset)
        try:
            res = requests.get(url=url_search, headers=headers, timeout=3)
            time.sleep(1)  # be polite between page fetches
            soup = BeautifulSoup(res.text, 'lxml')
            for link in soup.find_all('h3'):
                try:
                    # Baidu wraps results in redirect URLs; follow the link so
                    # requests resolves it, then read the final URL's domain.
                    res_real = requests.get(url=link.a.attrs['href'],
                                            headers=headers, timeout=3)
                    domain = urlparse(res_real.url).netloc
                    print(domain)
                    fileDomain.write(domain + '\n')
                    fileDomain.flush()
                except (requests.RequestException, AttributeError, KeyError):
                    # Best-effort crawl: skip results with no usable <a href>
                    # or whose follow-up request fails/times out.
                    continue
        except requests.RequestException:
            # Skip a result page that fails to load; keep crawling the rest.
            continue
def threadingrun(keyword):
    """Fan the crawl for *keyword* out across 10 spider threads.

    Each thread receives a distinct index so it covers a different slice of
    the result pages. Threads are joined before returning, guaranteeing the
    crawl for this keyword has finished when the function exits (the original
    left them running and relied on the caller sleeping).
    """
    workers = []
    for index in range(10):
        worker = threading.Thread(target=spider, args=(index, keyword))
        workers.append(worker)
        worker.start()
        time.sleep(0.001)  # slight stagger so requests don't all fire at once
    for worker in workers:
        worker.join()
if __name__ == '__main__':
    # Shared output handle written by the spider threads. Deliberately left
    # open for the life of the process: worker threads are never joined here,
    # so they may still be writing after the main loop has moved on.
    fileDomain = open('filedomain.txt', 'a+', encoding='utf-8')  # collected domains
    # Keyword dictionary (input) and the log of processed keywords (output)
    # are private to this loop, so they can be context-managed safely.
    with open('baidukeyword.txt', 'r', encoding='utf-8') as file, \
         open('fileprint.txt', 'w', encoding='utf-8') as filePrint:
        for keyword in file:
            keyword = keyword.strip()
            if not keyword:
                continue  # skip blank lines in the dictionary file
            print('=' * 30 + keyword + '=' * 30)
            filePrint.write(keyword + '\n')
            filePrint.flush()
            threadingrun(keyword)
            # Crude pacing between keywords; also gives any straggler
            # threads time to finish writing.
            time.sleep(20)