import requests
import pandas as pd
from bs4 import BeautifulSoup
import os
import time
import random
from lxml import etree
def get_html_text(url):
    """Download *url* and return the decoded HTML body.

    A User-Agent header and an HTTP proxy are chosen at random for each
    request to make consecutive requests look less uniform.

    :param url: page URL to fetch.
    :return: response body as ``str``, or ``None`` when the request fails
        (timeout, HTTP error status, dead proxy, ...).
    """
    # Pool of User-Agent strings to rotate through.
    headers = [
        {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko)"
                       " Chrome/35.0.1916.153 Safari/537.36"},
        {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0"},
        {"User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"}
    ]
    head = random.choice(headers)
    # Pool of HTTP proxies. NOTE(review): these are public proxy addresses
    # and are likely stale — verify before relying on them.
    proxies = [
        {"http": "123.206.25.108:808"},
        {"http": "61.150.96.27:36880"},
        {"http": "1.198.73.42:9999"},
    ]
    proxie = random.choice(proxies)
    try:
        r = requests.get(url, timeout=30, headers=head, proxies=proxie)
        r.raise_for_status()
        # Use the detected encoding so non-UTF-8 pages decode correctly.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        # Bug fix: the original returned the exception object itself, which
        # downstream code then passed to etree.HTML(). Signal failure with
        # None instead so callers can test for it explicitly.
        print(e)
        return None
def get_live_infolist(url, live_info_lists):
    """Scrape one Douyu category page and append the results in place.

    Three parallel lists are appended to *live_info_lists*, in this order:
    current heat values, streamer names, zone/category names.

    :param url: Douyu category page URL.
    :param live_info_lists: accumulator list; mutated in place.
    :return: *live_info_lists*, for convenience.
    """
    html = get_html_text(url)
    # Robustness fix: get_html_text signals failure with a non-string value
    # (None, or an exception object in the original); skip parsing then
    # instead of crashing inside etree.HTML().
    if not isinstance(html, str) or not html:
        print("failed to download page, skipping parse")
        return live_info_lists
    dom = etree.HTML(html)
    # All three fields live under the same card-info <div>; build the
    # common prefix once so the three expressions stay in sync.
    info = ('//li/div/a/div[@class="DyListCover-content"]'
            '/div[@class="DyListCover-info"]')
    current_hots = dom.xpath(info + '/span[@class="DyListCover-hot is-template"]/text()')
    live_users = dom.xpath(info + '/h2[@class="DyListCover-user is-template"]/text()')
    live_zones = dom.xpath(info + '/span[@class="DyListCover-zone"]/text()')
    print(live_zones)
    print(live_users)
    print(current_hots)
    live_info_lists.append(current_hots)
    live_info_lists.append(live_users)
    live_info_lists.append(live_zones)
    return live_info_lists
def save_live_info(live_info_lists):
    """Append the scraped live data to ``douyu_data.csv``.

    :param live_info_lists: ``[heat_values, streamer_names, zone_names]``
        as produced by ``get_live_infolist``.
    """
    DataSet = list(zip(live_info_lists[0], live_info_lists[1], live_info_lists[2]))
    # Bug fix: rows are (heat, streamer, zone) — live_info_lists[0] holds the
    # heat values — but the original labelled the columns in the reverse
    # order ['分区', '主播', '实时热度'], mislabelling every column.
    df = pd.DataFrame(data=DataSet, columns=['实时热度', '主播', '分区'])
    print(df)
    try:
        # "a+" appends on every run; write the header only when the file is
        # first created, and drop the index so row numbers don't restart
        # with each append (the original duplicated both).
        write_header = not os.path.exists("douyu_data.csv")
        df.to_csv("douyu_data.csv", mode="a+", encoding="gb18030",
                  header=write_header, index=False)
    except Exception as e:
        # Best-effort persistence: report and continue (matches original).
        print(e)
def main():
    """Entry point: scrape the Douyu 王者荣耀 category page and save to CSV."""
    url = 'https://www.douyu.com/g_wzry'
    live_info_lists = []
    # Bug fix: the original called get_html_text(url) here and discarded the
    # result — a wasted duplicate download, since get_live_infolist fetches
    # the same page itself.
    get_live_infolist(url, live_info_lists)
    save_live_info(live_info_lists)


if __name__ == "__main__":
    main()
# Scrapes Douyu streamer heat values and room info with XPath.
# (Originally published as a blog post; latest recommended article 2023-03-31 16:53:35.)