import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import time
import threading
def spider(num, keyword):
    """Crawl Baidu search results for *keyword* and record each result's domain.

    Walks 8 result pages starting at offset ``num * 10`` (in steps of 100),
    follows every result link to resolve its real URL, prints the domain and
    appends it to the module-level ``fileDomain`` handle (opened in main).

    num     -- thread index; offsets the starting result page so that the
               10 spider threads cover disjoint slices.
    keyword -- the search term to query.
    """
    headers = {
        # Spoof a desktop Chrome UA so Baidu serves normal result pages.
        'User-Agent': 'Mozilla/5.0(WindowsNT6.1;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko)Chrome/72.0.3626.121Safari/537.36'}
    for offset in range(0, 800, 100):  # was named 'sum' -- don't shadow the builtin
        url_search = 'https://www.baidu.com/s?wd=' + keyword + '&pn=' + str(num * 10 + offset)
        try:
            res = requests.get(url=url_search, headers=headers, timeout=3)
            time.sleep(1)  # be polite between page fetches
            soup = BeautifulSoup(res.text, 'lxml')
            for link in soup.find_all('h3'):
                try:
                    # Baidu wraps results in redirect URLs; follow the link so
                    # requests resolves it, then read the final URL's domain.
                    res_real = requests.get(url=link.a.attrs['href'],
                                            headers=headers, timeout=3)
                    domain = urlparse(res_real.url).netloc
                    print(domain)
                    fileDomain.write(domain + '\n')
                    fileDomain.flush()
                except (requests.RequestException, AttributeError, KeyError):
                    # Best-effort crawl: skip results with no usable <a href>
                    # or whose follow-up request fails/times out.
                    continue
        except requests.RequestException:
            # Skip a result page that fails to load; keep crawling the rest.
            continue
def threadingrun(keyword):
    """Fan the crawl for *keyword* out across 10 spider threads.

    Each thread receives a distinct index so it covers a different slice of
    the result pages. Threads are joined before returning, guaranteeing the
    crawl for this keyword has finished when the function exits (the original
    left them running and relied on the caller sleeping).
    """
    workers = []
    for index in range(10):
        worker = threading.Thread(target=spider, args=(index, keyword))
        workers.append(worker)
        worker.start()
        time.sleep(0.001)  # slight stagger so requests don't all fire at once
    for worker in workers:
        worker.join()
if __name__ == '__main__':
    # Shared output handle written by the spider threads. Deliberately left
    # open for the life of the process: worker threads are never joined here,
    # so they may still be writing after the main loop has moved on.
    fileDomain = open('filedomain.txt', 'a+', encoding='utf-8')  # collected domains
    # Keyword dictionary (input) and the log of processed keywords (output)
    # are private to this loop, so they can be context-managed safely.
    with open('baidukeyword.txt', 'r', encoding='utf-8') as file, \
         open('fileprint.txt', 'w', encoding='utf-8') as filePrint:
        for keyword in file:
            keyword = keyword.strip()
            if not keyword:
                continue  # skip blank lines in the dictionary file
            print('=' * 30 + keyword + '=' * 30)
            filePrint.write(keyword + '\n')
            filePrint.flush()
            threadingrun(keyword)
            # Crude pacing between keywords; also gives any straggler
            # threads time to finish writing.
            time.sleep(20)