Scraping Bilibili Video Information with Python

# -*- coding: utf-8 -*-
# @author: Tele
# @Time : 2019/04/08 1:01 PM
import requests
import json
import os
import re
import shutil
from lxml import etree


# Crawl the first 5 pages of each target menu section
class BiliSplider:
    def __init__(self, save_dir, menu_list):
        self.target = menu_list
        self.url_temp = "https://www.bilibili.com/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
            # "Cookie": "LIVE_BUVID=AUTO6715546997211617; buvid3=07192BD6-2288-4BA5-9259-8E0BF6381C9347193infoc; stardustvideo=1; CURRENT_FNVAL=16; sid=l0fnfa5e; rpdid=bfAHHkDF:cq6flbmZ:Ohzhw:1Hdog8",
        }
        self.proxies = {
            "http": "http://61.190.102.50:15845"
        }
        self.father_dir = save_dir

    def get_menu_list(self):
        # Used to strip the leading "//" from protocol-relative hrefs
        regex = re.compile("//")
        response = requests.get(self.url_temp, headers=self.headers)
        html_element = etree.HTML(response.content)
        nav_menu_list = html_element.xpath("//div[@id='primary_menu']/ul[@class='nav-menu']/li/a")

        menu_list = list()
        for item in nav_menu_list:
            menu = dict()
            title = item.xpath("./*/text()")
            menu["title"] = title[0] if len(title) > 0 else None
            href = item.xpath("./@href")
            menu["href"] = "https://" + regex.sub("", href[0]) if len(href) > 0 else None

            # Submenus
            submenu_list = list()
            sub_nav_list = item.xpath("./../ul[@class='sub-nav']/li")
            if len(sub_nav_list) > 0:
                for sub in sub_nav_list:
                    submenu = dict()
                    sub_title = sub.xpath("./a/span/text()")
                    submenu["title"] = sub_title[0] if len(sub_title) > 0 else None
                    sub_href = sub.xpath("./a/@href")
                    submenu["href"] = "https://" + regex.sub("", sub_href[0]) if len(sub_href) > 0 else None
                    submenu_list.append(submenu)
            menu["submenu_list"] = submenu_list if len(submenu_list) > 0 else None
            menu_list.append(menu)
        return menu_list

    # rid = tid
    def parse_index_url(self, url):
        result_list = list()
        # Regex match: the original pattern was swallowed by the blog page's HTML;
        # the one below is a reconstruction that captures the <script> block
        # embedding window.__INITIAL_STATE__ on the section index page.
        regex = re.compile(r"<script>window\.__INITIAL_STATE__=(.*?)</script>")
        response = requests.get(url, headers=self.headers)
        result = regex.findall(response.content.decode())
        temp = re.compile(r"(.*);\(function").findall(result[0]) if len(result) > 0 else None
        sub_list = json.loads(temp[0])["config"]["sub"] if temp else list()
        if len(sub_list) > 0:
            for sub in sub_list:
                # Some submenus have no rid and would need a different url; skip them for now
                if "tid" in sub:
                    if sub["tid"]:
                        sub_menu = dict()
                        sub_menu["rid"] = sub["tid"] if sub["tid"] else None
                        sub_menu["title"] = sub["name"] if sub["name"] else None
                        result_list.append(sub_menu)
                else:
                    pass

        return result_list

    # Latest activity: region?callback
    # Video data: newlist?callback
    def parse_sub_url(self, item):
        self.headers["Referer"] = item["referer"]
        url_pattern = "https://api.bilibili.com/x/web-interface/newlist?rid={}&type=0&pn={}&ps=20"

        # Crawl the first 5 pages of each menu
        for i in range(1, 6):
            data = dict()
            url = url_pattern.format(item["rid"], i)
            print(url)
            try:
                response = requests.get(url, headers=self.headers, proxies=self.proxies, timeout=10)
            except requests.RequestException:
                return
            if response.status_code == 200:
                data["content"] = json.loads(response.content.decode())["data"]
                data["title"] = item["title"]
                data["index"] = i
                data["menu"] = item["menu"]
                # Save the data
                self.save_data(data)
            else:
                print("请求超时")  # usually a 403: the IP has been blocked

    def save_data(self, data):
        if len(data["content"]) == 0:
            return
        parent_path = self.father_dir + "/" + data["menu"] + "/" + data["title"]
        if not os.path.exists(parent_path):
            os.makedirs(parent_path)
        file_dir = parent_path + "/" + "第" + str(data["index"]) + "页.txt"

        # Write the page data as JSON
        with open(file_dir, "w", encoding="utf-8") as file:
            file.write(json.dumps(data["content"], ensure_ascii=False, indent=2))

    def run(self):
        # Clear previously saved data
        if os.path.exists(self.father_dir):
            shutil.rmtree(self.father_dir)

        menu_list = self.get_menu_list()
        menu_info = list()
        # Collect the target menus.
        # Special list: for some menus the rid has to be taken from a submenu's url
        special_list = list()
        for menu in menu_list:
            for t in self.target:
                if menu["title"] == t:
                    if menu["title"] == "番剧" or menu["title"] == "国创" or menu["title"] == "影视":
                        special_list.append(menu)
                    menu_info.append(menu)
                    break

        # Index page of each target menu
        if len(menu_info) > 0:
            for info in menu_info:
                menu_index_url = info["href"]
                # Handle the special menus
                if info in special_list:
                    menu_index_url = info["submenu_list"][0]["href"]
                # Get the rid
                result_list = self.parse_index_url(menu_index_url)
                print(result_list)
                if len(result_list) > 0:
                    for item in result_list:
                        # Top-level menu
                        item["menu"] = info["title"]
                        item["referer"] = menu_index_url
                        # Crawl the submenu
                        self.parse_sub_url(item)


def main():
    target = ["动画", "番剧", "国创", "音乐", "舞蹈", "游戏", "科技", "数码", "生活", "鬼畜", "时尚", "广告", "娱乐", "影视"]
    splider = BiliSplider("f:/bili_splider", target)
    splider.run()


if __name__ == '__main__':
    main()
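After a run, each saved page file holds the raw "data" object returned by the newlist API for that menu and page. A minimal sketch for inspecting one of those files is below; it assumes the payload keeps its usual "archives" list with "title", "owner" and "stat" fields, and the file path is only an example following the save_data() layout above.

# Minimal sketch: read one saved page and print a few fields per video.
# Assumes the newlist payload contains an "archives" list; adjust the keys
# if the API shape differs. The path below is hypothetical.
import json

# <save_dir>/<menu>/<sub-menu>/第N页.txt
page_file = "f:/bili_splider/动画/综合/第1页.txt"

with open(page_file, encoding="utf-8") as f:
    page = json.load(f)

for video in page.get("archives", []):
    owner = video.get("owner", {}).get("name")
    views = video.get("stat", {}).get("view")
    print(video.get("title"), "-", owner, "-", views, "views")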
