标题 Python爬取豆瓣一周口碑榜
@author XGan 2019/11/09 周六
获取豆瓣电影网一周口碑榜
网址:https://movie.douban.com/
使用 Python 的 requests 库和 XPath 实现
import requests
import random
from lxml import etree
import json as josn
import encodings
# Pool of desktop-browser User-Agent strings; one is picked at random for
# each request so the scraper does not always present the same client.
head = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
]
def gethtml(url, timeout=10):
    """Download *url* and return it parsed as an lxml HTML element tree.

    A User-Agent is chosen at random from the module-level ``head`` list
    and sent with the request.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The root element produced by ``etree.HTML`` for the response text.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx response (raised by ``raise_for_status``).
    """
    res = requests.get(
        url,
        headers={"User-Agent": random.choice(head)},
        timeout=timeout,
    )
    # Fail loudly on an error response instead of parsing an error page
    # and silently producing an empty result downstream.
    res.raise_for_status()
    return etree.HTML(res.text)
# XPath expressions used:
# movie titles: //div[@class='billboard-bd']//td/a/text()
# movie links:  //div[@id='billboard']/div[@class='billboard-bd']//td//a/@href
def html_process(html, out_path="../aliyun/一周热映.josn"):
    """Extract the billboard entries from *html* and save them to a file.

    Every ``<a>`` inside a table row of the ``billboard`` /
    ``billboard-bd`` section is read; its first text node is used as the
    title and its ``href`` attribute as the link. Each pair is written as
    one JSON object per line, ``{"<title>": "<href>"}``.

    Args:
        html: Parsed document exposing an lxml-style ``xpath`` method.
        out_path: Destination file. NOTE(review): the default was garbled
            in the original source (it contained a typographic ellipsis);
            ``../aliyun/`` is the presumed intent -- confirm.

    Returns:
        None. The result is the file written at *out_path*.
    """
    # Local import keeps this function self-contained; the file-level
    # import binds the module under a different alias.
    import json

    anchors = html.xpath(
        "//div[@id='billboard']/div[@class='billboard-bd']//tr//a"
    )

    # Explicit UTF-8 so the Chinese titles are written correctly regardless
    # of the platform's default encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        print("*" * 30 + "开始保存josn文件完成" + "*" * 30)
        for anchor in anchors:
            text_nodes = anchor.xpath("./text()")
            # Select href by name rather than by attribute position, which
            # depends on the order attributes appear in the markup.
            hrefs = anchor.xpath("./@href")
            if not text_nodes or not hrefs:
                # Anchor without text or link: nothing useful to record.
                continue
            entry = {str(text_nodes[0]): str(hrefs[0])}
            # ensure_ascii=False keeps the titles human-readable in the file.
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
    # Reported only after the file has been closed (and therefore flushed).
    print("*" * 30 + "保存josn文件完成" + "*" * 30)
if __name__ == "__main__":
    # Script entry point: fetch the Douban Movie home page and save the
    # billboard entries found on it.
    url = "https://movie.douban.com/"
    tree = gethtml(url)
    html_process(tree)
运行结果
在生成的 .josn 文件中查看保存的结果
-------------blog 2019/11/12 周二 修改样式