# 百度域名多线程采集 — multithreaded collector of domains from Baidu search results

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import time
import threading

def spider(num, keyword):
    """Scrape Baidu result pages for *keyword* and record each result's real domain.

    For a series of page offsets, fetches the Baidu SERP, extracts every
    ``<h3>`` result link, follows its redirect to the real URL, and appends
    the netloc to the module-level ``fileDomain`` handle (opened in ``__main__``).

    num     -- thread index; combined with the offset to pick distinct pages
    keyword -- search term to query
    """
    # Hoisted out of the loop: the headers never change between requests.
    headers = {
        'User-Agent': 'Mozilla/5.0(WindowsNT6.1;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/72.0.3626.121Safari/537.36'}
    # 'offset' instead of the original 'sum', which shadowed the builtin.
    for offset in range(0, 800, 100):
        urlSearch = 'https://www.baidu.com/s?wd=' + keyword + '&pn=' + str(num * 10 + offset)
        try:
            res = requests.get(url=urlSearch, headers=headers, timeout=3)
            time.sleep(1)  # be polite between page fetches
            soup = BeautifulSoup(res.text, 'lxml')
            for urlLink in soup.find_all("h3"):
                try:
                    # Baidu result hrefs are redirect URLs; follow them to
                    # discover the real target, then keep only the domain.
                    urlEncode = urlLink.a.attrs['href']
                    resUrl = requests.get(url=urlEncode, headers=headers, timeout=3)
                    urlDomain = urlparse(resUrl.url).netloc
                    print(urlDomain)
                    fileDomain.write(urlDomain + '\n')
                    fileDomain.flush()  # persist immediately; threads may be killed
                except (requests.RequestException, AttributeError, KeyError):
                    # Best-effort: skip results with no <a>/'href' or that fail
                    # to fetch, instead of the original bare `except: pass`.
                    pass
        except requests.RequestException:
            # Best-effort: a failed SERP fetch just skips this offset.
            pass

def threadingrun(keyword):
    """Launch 10 spider threads for *keyword*, each covering distinct pages.

    Threads are created first, then started with a tiny stagger so their
    initial requests do not all fire at the same instant. They are not
    joined; the caller paces itself with a sleep between keywords.
    """
    threadingList = []
    for i in range(10):
        # i is the per-thread page-offset index consumed by spider().
        t = threading.Thread(target=spider, args=(i, keyword))
        threadingList.append(t)
    for t in threadingList:
        t.start()
        time.sleep(0.001)  # stagger thread start-up slightly

if __name__ == '__main__':
    # NOTE: the original read `if name == 'main':`, which raises NameError
    # (and would never guard); the dunder form is the correct entry guard.
    file = open('baidukeyword.txt', 'r', encoding='utf-8')        # keyword dictionary, one per line
    # Kept as a plain open (not `with`): spider threads write to this global
    # handle and may still be running when this loop body finishes.
    fileDomain = open('filedomain.txt', 'a+', encoding='utf-8')   # collected domains
    filePrint = open('fileprint.txt', 'w', encoding='utf-8')      # keywords already processed
    for keyword in file:
        keyword = keyword.strip()
        print('=' * 30 + keyword + '=' * 30)
        filePrint.write(keyword + '\n')
        filePrint.flush()  # record progress immediately in case of interruption
        threadingrun(keyword)
        time.sleep(20)  # crude pacing: let this keyword's threads drain before the next

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值