花点时间改良了一下代码,如下:
import requests
from bs4 import BeautifulSoup
import pymongo
import lxml
import time, datetime
class douyu_host_info():
    """Crawler that scrapes Douyu live-stream host data per category and
    stores records in a local MongoDB instance (database 'douyu')."""

    def __init__(self):
        # Site root; category suffixes below are appended to it.
        self.url_host = 'https://www.douyu.com'
        # Timestamp baked into each record and into the result sheet name.
        self.date_time = datetime.datetime.now().strftime('%Y%m%d_%H%M')
        self.url_list = []
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        }
        # Category URL suffixes; these are fixed, so they were captured once
        # on a first crawl and hard-coded here.
        self.categorys_list = [
            '/g_LOL', '/g_blzy', '/g_DOTA2', '/g_qipai', '/g_DNF', '/g_CF', '/g_mszb', '/g_CSGO', '/g_How', '/g_DOTA',
            '/g_WOW', '/g_nsh', '/g_Overwatch', '/g_wxy', '/directory/category/PCgame', '/g_jdqs', '/g_TVgame',
            '/g_gwlrsj', '/g_FTG', '/g_xyqx', '/g_NBA2K', '/g_BF', '/g_DG', '/directory/category/djry', '/g_wzry',
            '/g_jdqscjzc', '/g_jdqsqjcj', '/g_qqfcsy', '/g_hyrz', '/g_xyzx', '/g_HLMJ', '/g_phone', '/g_LRSZQ',
            '/g_mhmnz', '/g_CFSY', '/directory/category/syxx', '/g_yz', '/g_xingyu', '/g_ecy', '/g_yqk', '/g_HW',
            '/g_ms', '/g_music', '/g_ip', '/directory/category/yl', '/g_smkj', '/g_yj', '/g_Finance', '/g_kepu',
            '/g_js', '/g_car', '/g_jlp', '/g_tkx', '/directory/sport/cate', '/g_FM233', '/g_yydt', '/g_lianmaihudong',
            '/g_qinggan', '/directory/category/voice', '/g_znl',
        ]

    def Mongodb_set(self, sheet_name, r_data):
        """Insert one record into collection *sheet_name* of the local
        'douyu' database.

        :param sheet_name: target collection name
        :param r_data: dict to insert
        """
        client = pymongo.MongoClient('localhost', 27017)
        try:
            collection = client['douyu'][sheet_name]
            print(r_data)
            collection.insert_one(r_data)
        finally:
            # The original leaked one MongoClient (and its connection pool)
            # per insert; always release it.
            client.close()

    def get_url_list(self):
        """Build the full category URL list, persist each URL to the
        'url_list' sheet, and return the list."""
        for category in self.categorys_list:
            category_url = self.url_host + category
            self.url_list.append(category_url)
            self.Mongodb_set(sheet_name='url_list', r_data={'url': category_url})
        return self.url_list

    def get_host_info(self, url):
        """Fetch one category page and store every host whose popularity
        exceeds 2万 into a timestamped sheet.

        :param url: full category page URL
        """
        time.sleep(0.2)  # throttle between requests
        print('Now start open {}'.format(url))
        wb_data = None
        for _ in range(3):
            try:
                wb_data = requests.get(url, headers=self.headers)
                break
            except requests.exceptions.RequestException:
                print('net work error! will retry 3 times')
        if wb_data is None:
            # All three attempts failed; the original would crash here with
            # an unbound 'wb_data' NameError. Skip this category instead.
            return
        soup = BeautifulSoup(wb_data.text, 'lxml')
        print('start analazy url')
        try:
            category = soup.select('h1')[0].get_text()
        except IndexError:
            # Page has no <h1>; fall back to a placeholder category.
            category = '未定義類別'
        names = soup.select('.ellipsis.fl')
        nums = soup.select('.dy-num.fr')
        titles = soup.select('.mes h3')
        hrefs = soup.select('#live-list-contentbox li a')
        for name, num, href, title in zip(names, nums, hrefs, titles):
            num_text = num.get_text()
            # Popularity normalised to units of 万 (10,000): '12.3万' -> 12.3,
            # a plain count like '5000' -> 0.5.
            popularity = float(num_text[:-1]) if '万' in num_text else float(num_text) / 10000
            data = {
                '類別': category,
                '主播': name.get_text(),
                '标题': title.get_text().split('\n')[-1].strip(),
                '链接': self.url_host + href.get('href'),
                '人氣指數': popularity,
                '當前時間': self.date_time,
            }
            # Only keep hosts with more than 2万 popularity.
            if data['人氣指數'] > 2:
                self.Mongodb_set(sheet_name='host_info_{}'.format(self.date_time), r_data=data)

    def db_check(self, sheetname, key_word):
        """Print every record in collection *sheetname* matching the Mongo
        query dict *key_word*."""
        client = pymongo.MongoClient('localhost', 27017)
        try:
            collection = client['douyu'][sheetname]
            for data in collection.find(key_word):
                print(data)
        finally:
            client.close()
from multiprocessing import Pool
# NOTE(review): re-imports the class from module 'douyu_host_2' — presumably
# this file IS douyu_host_2.py, so Pool workers can pickle/re-import the bound
# methods; confirm the filename matches.
from douyu_host_2 import douyu_host_info
# Module-level crawler instance shared by the helper functions below.
douyu = douyu_host_info()
def data_check():
    """Query a hard-coded result sheet and print the matching records.

    Example query dicts: {u'當前時間': '20180901 10:58', u'人氣指數': {'$gte': 30}}
    """
    # Interactive variant (disabled):
    #   sheetname = input('Which sheet do you want to check')
    #   key_word = input('Do you want to check with?')
    target_sheet = 'host_info_20180901_1530'
    query = {'類別': 'DOTA2'}
    douyu.db_check(sheetname=target_sheet, key_word=query)
def w_to_db():
    """Crawl every category page in parallel worker processes and write the
    results to MongoDB."""
    url_list = douyu.get_url_list()
    # Context manager closes and joins the pool; the original never called
    # pool.close()/pool.join(), leaking worker processes.
    with Pool() as pool:
        pool.map(douyu.get_host_info, url_list)
if __name__ == '__main__':
    # Crawl first, then run the sample query against the freshly stored data.
    w_to_db()
    data_check()
这个爬虫没有包含翻页,只爬取了每个类别下面的首页;若要翻页爬取所有主播,请参考下面这个脚本:
https://www.cnblogs.com/lkd8477604/p/9848958.html