我爱哔哩哔哩
总结:
①正则是盲点,最近在想办法努力突破
②python提取xml文件中的字段,可用beautifulsoup包,nice
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 1 18:11:56 2020
@author: Administrator
"""
import requests
import json
import re
from lxml import etree
from bs4 import BeautifulSoup
# 获取b站某一个小目录类的特定时间段内的热门视频的全部弹幕
class Bili():
    """Scrape all danmaku (bullet comments) of the hot-ranked videos in one
    Bilibili sub-category (cate_id=28) over a fixed time window, and append
    them as pretty-printed JSON to one text file per result page.
    """

    def __init__(self):
        # Hot-rank search API, pages of 20, window 2020-02-24 .. 2020-03-02.
        # BUG FIX: the original URL contained "©_right=-1" — the literal
        # parameter "&copy_right" had been mangled into the HTML entity "©";
        # restored to the real query parameter "&copy_right=-1".
        self.url_temp = (
            "https://s.search.bilibili.com/cate/search?&main_ver=v3"
            "&search_type=video&view_type=hot_rank&order=click&copy_right=-1"
            "&cate_id=28&page={}&pagesize=20&jsonp=jsonp"
            "&time_from=20200224&time_to=20200302&_=1583140287400"
        )
        self.headers = {
            "user-agent": "Mozilla/5.0(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"}
        # Danmaku XML endpoint; {} is filled with the video's cid.
        self.dm_url_part = "https://api.bilibili.com/x/v1/dm/list.so?oid={}"

    def parse_url(self, url):
        """GET *url* with the browser-like headers; return the body as UTF-8 text."""
        response = requests.get(url, headers=self.headers)
        return response.content.decode(encoding="utf-8")

    def get_content(self, url_str):
        """Parse one page of the hot-rank JSON response.

        Returns a tuple ``(content_list, page, total_page)`` where
        *content_list* is a list of per-video dicts (including every
        danmaku of the video), *page* is the page number reported by the
        API, and *total_page* is the total number of result pages.
        """
        json_str = json.loads(url_str)
        content_list = []
        for ret in json_str["result"]:
            item = {
                "author": ret["author"],
                "poster-img": ret["pic"],
                "playNum": ret["play"],
                "dmNum": ret["video_review"],
                "href": ret["arcurl"],
                "rank": ret["rank_offset"],
            }
            # Follow the video link to collect its danmaku (extra request).
            item["dm"] = self.get_dm(item["href"])
            content_list.append(item)
        return content_list, json_str["page"], json_str["numPages"]

    def get_dm(self, detail_url):
        """Fetch the video page, extract its cid, then download the danmaku
        XML and return every comment's text as a list of strings.
        """
        detail_html = self.parse_url(detail_url)
        match = re.search(r'"cid":(\d+)', detail_html)
        if match is None:
            # Robustness fix: the original called .group(1) unconditionally
            # and raised AttributeError when the page layout changed or the
            # request was blocked; return no danmaku instead.
            return []
        dm_url = self.dm_url_part.format(match.group(1))
        dm_str = self.parse_url(dm_url)
        # Each <d> element of the danmaku XML holds one comment's text;
        # get_text() strips the tags (equivalent to the original regex sub).
        dm_list = BeautifulSoup(dm_str, "lxml").findAll("d")
        return [d.get_text() for d in dm_list]

    def save_content(self, content_list, page):
        """Append each item of *content_list* as pretty-printed JSON to the
        per-page file ``b站<page>.txt``."""
        file_path = "b站" + str(page) + ".txt"
        with open(file_path, "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False, indent=2))
                f.write("\n")
        print("save successfully")

    def run(self):
        """Crawl every result page: build URL, fetch, parse, save.

        BUG FIX: the original never advanced ``page`` (``get_content``
        returns the *current* page number from the JSON), so the loop
        re-fetched page 1 forever; its ``page != total_page`` condition
        would also have skipped the final page. Now the loop increments
        ``page`` and runs through ``total_page`` inclusive.
        """
        page = 1
        total_page = 1
        while page <= total_page:
            url = self.url_temp.format(page)
            url_str = self.parse_url(url)
            content_list, page, total_page = self.get_content(url_str)
            self.save_content(content_list, page)
            page += 1
if __name__ == "__main__":
    # Entry point: crawl and persist every page of hot-ranked videos.
    spider = Bili()
    spider.run()
-----------------视频学习 小酥仙 2020.3.3 -----------------------------