附代码
直播首页数据
import requests
import flask
from bs4 import BeautifulSoup
from lxml import etree
def getInfoUtils(info):
    """Return the first element of *info*, or an empty string if there is none.

    Args:
        info: A sequence (typically an XPath result list); may be empty or
            ``None``.

    Returns:
        ``info[0]`` when *info* is non-empty, otherwise ``''``.
    """
    # The fallback must be returned explicitly; a bare '' expression statement
    # would make the function fall through and yield None.
    return info[0] if info else ''
def getHtml(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with status 200, otherwise
        ``None`` (non-200 status, or the request itself failed).
    """
    try:
        # The request is the call that can actually fail (connection error,
        # timeout, ...), so it has to be inside the try block.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Never hand an exception object back as if it were page content.
        return None
    if response.status_code != 200:
        return None
    return response.text
def htmlToTree(html):
    """Parse an HTML string with ``etree.HTML`` and return the result."""
    return etree.HTML(html)
def _firstText(node, path):
    """Evaluate *path* on *node* and reduce the matches with getInfoUtils.

    A missing *node* (``None``) is treated as "no matches".
    """
    matches = node.xpath(path) if node is not None else []
    return getInfoUtils(matches)


def parseLi(li):
    """Extract one live-room record from the markup of a single list item.

    Args:
        li: HTML markup (string) of one ``<li>`` element.

    Returns:
        A dict with the keys ``live_link``, ``live_type``, ``live_title``,
        ``live_hot`` and ``live_name``. ``live_link`` is ``''`` when the item
        has no cover link; the other fields take whatever ``getInfoUtils``
        returns for an empty match when their node is missing.
    """
    li_tree = htmlToTree(li)
    anchor_path = '//div[@class="DyListCover HeaderCell is-href"]/a[1]'
    hrefs = li_tree.xpath(anchor_path + '/@href')
    anchors = li_tree.xpath(anchor_path)

    # Query the already-parsed elements with relative XPath instead of
    # serialising each subtree and parsing it again.
    info_blocks = []
    if anchors:
        info_blocks = anchors[0].xpath(
            './/div[@class ="DyListCover-content"]/div[@class="DyListCover-info"]')

    # Items that lack the expected markup must not abort the whole scrape with
    # an IndexError, so each info block is optional.
    first_info = info_blocks[0] if len(info_blocks) > 0 else None
    second_info = info_blocks[1] if len(info_blocks) > 1 else None

    return {
        'live_link': ("https://www.douyu.com" + hrefs[0]) if hrefs else '',
        'live_type': _firstText(first_info, './/span/text()'),
        'live_title': _firstText(first_info, './/h3/text()'),
        'live_hot': _firstText(second_info, './/span/text()'),
        'live_name': _firstText(second_info, './/h2/div/text()'),
    }
def parseHtml(html):
    """Parse a live-listing page and return one record per list item.

    Args:
        html: Page markup as returned by ``getHtml``. May be ``None`` or a
            non-string value when the download failed.

    Returns:
        A list with the ``parseLi`` result for every matching ``<li>``; an
        empty list when there is nothing to parse.
    """
    # A failed download yields None (or another non-string value); report
    # "no rooms" instead of crashing inside the parser.
    if not html or not isinstance(html, (str, bytes)):
        return []
    tree = etree.HTML(html)
    if tree is None:
        return []

    path = '//main[@class="layout-Main"]//div[@class="layout-Module-container layout-Cover ' \
           'ListContent"]/ul/li'
    res_data = []
    for item in tree.xpath(path):
        # parseLi expects markup text, so each <li> element is serialised.
        markup = etree.tostring(item, pretty_print=True, encoding='utf-8').decode('utf-8')
        res_data.append(parseLi(markup))
    return res_data
def liveMain(url):
    """Fetch the page at *url* with getHtml and return parseHtml's result for it."""
    return parseHtml(getHtml(url))
if __name__ == '__main__':
    # Script entry point: scrape the "all" directory page and print the result.
    url = 'https://www.douyu.com/directory/all'
    print(liveMain(url))
分类下的直播数据
import requests
from lxml import etree
from douyuLive import liveMain
def getHtml(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with status 200, otherwise
        ``None`` (non-200 status, or the request itself failed).
    """
    try:
        # The request is the call that can actually fail (connection error,
        # timeout, ...), so it has to be inside the try block.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Never hand an exception object back as if it were page content.
        return None
    if response.status_code != 200:
        return None
    return response.text
def htmlToTree(html):
    """Parse an HTML string with ``etree.HTML`` and return the result."""
    return etree.HTML(html)
def parseCate(category):
    """Parse the markup of one category box into a title plus its entries.

    Args:
        category: HTML markup (string) of one category block.

    Returns:
        A dict with ``category_title`` (first heading text, or ``''`` when the
        block has no heading) and ``category_list``, a list of dicts with the
        keys ``category_item``, ``category_href`` and ``category_tatolhot``.
        The list is empty when the three scraped columns differ in length.
    """
    category_tree = htmlToTree(category)
    titles = category_tree.xpath('//div[@class="categoryBox-head"]/h4/text()')
    hrefs = category_tree.xpath('//ul[@class="layout-Classify-list"]/li/a/@href')
    names = category_tree.xpath('//ul[@class="layout-Classify-list"]/li/a/strong/text()')
    hots = category_tree.xpath(
        '//ul[@class="layout-Classify-list"]/li/a/div/span/text()')

    category_list = []
    # Only pair the columns up when they line up one-to-one; otherwise the
    # values cannot be matched to each other reliably.
    if len(names) == len(hots) == len(hrefs):
        category_list = [
            {
                'category_item': name,
                'category_href': "https://www.douyu.com" + href,
                'category_tatolhot': hot,
            }
            for name, href, hot in zip(names, hrefs, hots)
        ]

    return {
        # A block without a heading must not raise IndexError.
        "category_title": titles[0] if titles else '',
        "category_list": category_list,
    }
def parseHtml(html):
    """Parse the directory page and return one record per category block.

    Args:
        html: Page markup as returned by ``getHtml``. May be ``None`` or a
            non-string value when the download failed.

    Returns:
        A list with the ``parseCate`` result for each category block; an empty
        list when there is nothing to parse.
    """
    # A failed download yields None (or another non-string value); report
    # "no categories" instead of crashing inside the parser.
    if not html or not isinstance(html, (str, bytes)):
        return []
    tree = htmlToTree(html)
    if tree is None:
        return []

    all_categories = tree.xpath('//main[@id="allCate"]/section[@class="layout-Module"]/div')
    data_info = []
    # The first two child divs are skipped -- presumably they are not category
    # blocks; TODO confirm against the live page structure.
    for block in all_categories[2:]:
        # parseCate expects markup text, so each block element is serialised.
        category = etree.tostring(block, pretty_print=True,
                                  encoding='utf-8').decode('utf-8')
        data_info.append(parseCate(category))
    return data_info
def categoryMain(url):
    """Fetch the page at *url* with getHtml and return parseHtml's result for it."""
    return parseHtml(getHtml(url=url))
def main(data):
    """Call liveMain for every category entry in *data* and print the results.

    Args:
        data: Iterable of dicts, each holding a ``category_list`` of entries
            with ``category_href`` and ``category_item`` keys.
    """
    # Flatten lazily so groups are still read one at a time, in order.
    entries = (entry for group in data for entry in group["category_list"])
    for entry in entries:
        streams = liveMain(entry["category_href"])
        print("*******************", entry["category_item"])
        print(streams)
if __name__ == '__main__':
    # Script entry point: scrape the category directory, print it, then walk
    # every category through main().
    data = categoryMain("https://www.douyu.com/directory")
    print(data)
    main(data)