爬虫一:豆瓣电影关键字爬取,并存入 Excel
https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%83%82%E7%89%87&start=20
豆瓣电影信息还是比较好拿到的
# Request headers for Douban. NOTE: the original was missing the comma after
# the User-Agent value, so Python's implicit string concatenation fused
# "Mozilla...Safari/537.36" and "Cookie" into one meaningless dict key and
# the Cookie header was never actually sent.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    # Logged-in session cookie captured from a browser; it will expire —
    # replace with your own when reproducing.
    "Cookie": 'll="118245"; bid=ARtOOxDRteM; __utma=30149280.1180608368.1569232683.1569232683.1569232683.1; __utmc=30149280; __utmz=30149280.1569232683.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; acw_tc=2760828115692326854383671ef4946949a1588e4e0476180bf57b1e5f944d; ap_v=0,6.0; __utma=223695111.1224307206.1569232715.1569232715.1569232715.1; __utmb=223695111.0.10.1569232715; __utmc=223695111; __utmz=223695111.1569232715.1.1.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/passport/login; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1569232715%2C%22https%3A%2F%2Faccounts.douban.com%2Fpassport%2Flogin%3Fredir%3Dhttps%253A%252F%252Fmovie.douban.com%252F%22%5D; _pk_ses.100001.4cf6=*; push_noty_num=0; push_doumail_num=0; __yadk_uid=jOpNK0YqTHKGC26G4EsFha6rAzJzvStK; _vwo_uuid_v2=D2DBA0AE8FBA3AA58155A7BB3CF5E42B0|0fc9726cdfeda577fffd4d9b62d1989e; dbcl2="199182842:8rksCNklcW8"; ck=Taef; __utmv=30149280.19918; _pk_id.100001.4cf6=62084c1c85a029b2.1569232715.1.1569233089.1569232715.; __utmb=30149280.17.10.1569232683'
}
首先定义 headers 请求头,让网站认为我们是正常的人为访问。
# Fetch one page of search results (JSON) and scrape each movie's detail page.
response = requests.get(url, headers=headers)
for movie in json.loads(response.text)["data"]:
    movie_id = movie["id"]
    print(movie_id)
    detail_url = "https://movie.douban.com/subject/%s/" % movie_id
    detail_resp = requests.get(detail_url, headers=headers)
    if detail_resp.status_code == 200:
        # Parse the HTML and extract/save the movie's fields.
        save_info(etree.HTML(detail_resp.content.decode("utf-8")))
    time.sleep(1)  # throttle: one detail request per second
发出请求,得到 HTML 数据。
# Locate the <div id="info"> block that holds most of the movie metadata.
info = content.xpath("//div[@id='info']")[0]

def _first_text(nodes):
    # First matched text node with single quotes blanked out; "无" when the
    # XPath matched nothing. Fixes the old `...[0] if nodes else None`
    # pattern, which turned a missing field into the literal string "None"
    # instead of the intended "无" fallback.
    return str(nodes[0]).replace("'", " ") if nodes else "无"

def _joined_text(nodes):
    # All matched text nodes joined with '/'; "无" when the list is empty.
    return '/'.join(nodes).replace("'", " ") or "无"

name = _first_text(content.xpath('//*[@id="content"]/h1/span[1]/text()'))
daoyan = _first_text(info.xpath("./span[1]/span[2]/a/text()"))
bianju = _first_text(info.xpath("./span[2]/span[2]/a/text()"))
zhuyan = _joined_text(info.xpath("./span[3]/span[2]/a/text()"))
leixing = _joined_text(info.xpath("./span[@property='v:genre']/text()"))
shangyingshijian = _joined_text(info.xpath(".//span[@property='v:initialReleaseDate']/text()"))
shichang = _first_text(info.xpath(".//span[@property='v:runtime']/text()"))
pingfen = _first_text(content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()'))
jianjie = _first_text(content.xpath('// *[ @ id = "link-report"] / span[1]/text()'))
# Rating count is kept as raw text (no quote replacement, as before).
_votes = content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')
pingjiarenshu = _votes[0] if _votes else "无"

print("电影名称:", name)
print("导演:", daoyan)
print("编剧:", bianju)
print("主演:", zhuyan)
print("评分:", pingfen)
print("评价人数:", pingjiarenshu)
print("类型:", leixing)
print("上映时间:", shangyingshijian)
print("时长:", shichang)
print("简介:", jianjie)
# Append one row (fixed column order) to the module-level accumulator.
one_info = [name, daoyan, bianju, zhuyan, pingfen, pingjiarenshu, leixing, shangyingshijian, shichang, jianjie]
all_list.append(one_info)
通过 XPath 提取数据。
def processing_data(content_list):
    """Dump every scraped movie record into 电影信息.xls.

    content_list: list of rows, each row being a list of cell values.
    """
    workbook = xlwt.Workbook(encoding='utf-8')  # new UTF-8 spreadsheet
    sheet = workbook.add_sheet('My Worksheet')
    row = 0
    for record in content_list:
        for col, value in enumerate(record):
            sheet.write(row, col, label=value)  # one cell per field
        row += 1
    workbook.save('电影信息.xls')
将数据存入 Excel。
完整代码
import time
import xlwt
from lxml import etree
import requests
import json
def processing_data(content_list):
    """Persist the collected movie rows into a one-sheet .xls workbook."""
    book = xlwt.Workbook(encoding='utf-8')
    worksheet = book.add_sheet('My Worksheet')
    # Row index tracks the record, column index tracks the field.
    for r, line in enumerate(content_list):
        for c, cell in enumerate(line):
            worksheet.write(r, c, label=cell)
    book.save('电影信息.xls')
def save_info(content):
    """Extract one movie's fields from its detail page and record them.

    content: lxml HTML element tree of a Douban movie subject page.
    Side effects: prints each field and appends one row to the global
    `all_list`. Any field the page lacks falls back to "无".
    """
    def first_text(nodes):
        # First matched text node with single quotes blanked out; "无" when
        # nothing matched. Fixes the old `...[0] if nodes else None` pattern,
        # which produced the literal string "None" instead of "无".
        return str(nodes[0]).replace("'", " ") if nodes else "无"

    def joined_text(nodes):
        # All matched text nodes joined with '/'; "无" when the list is empty.
        return '/'.join(nodes).replace("'", " ") or "无"

    info = content.xpath("//div[@id='info']")[0]
    name = first_text(content.xpath('//*[@id="content"]/h1/span[1]/text()'))
    daoyan = first_text(info.xpath("./span[1]/span[2]/a/text()"))
    bianju = first_text(info.xpath("./span[2]/span[2]/a/text()"))
    zhuyan = joined_text(info.xpath("./span[3]/span[2]/a/text()"))
    leixing = joined_text(info.xpath("./span[@property='v:genre']/text()"))
    shangyingshijian = joined_text(info.xpath(".//span[@property='v:initialReleaseDate']/text()"))
    shichang = first_text(info.xpath(".//span[@property='v:runtime']/text()"))
    pingfen = first_text(content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()'))
    jianjie = first_text(content.xpath('// *[ @ id = "link-report"] / span[1]/text()'))
    # Rating count is kept as raw text (no quote replacement, as before).
    votes = content.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')
    pingjiarenshu = votes[0] if votes else "无"

    print("电影名称:", name)
    print("导演:", daoyan)
    print("编剧:", bianju)
    print("主演:", zhuyan)
    print("评分:", pingfen)
    print("评价人数:", pingjiarenshu)
    print("类型:", leixing)
    print("上映时间:", shangyingshijian)
    print("时长:", shichang)
    print("简介:", jianjie)
    # Append one fixed-order row to the module-level accumulator.
    one_info = [name, daoyan, bianju, zhuyan, pingfen, pingjiarenshu, leixing, shangyingshijian, shichang, jianjie]
    all_list.append(one_info)
def main():
    """Crawl 60 search-result pages of the '烂片' tag and scrape each movie.

    If the crawl aborts (network error, Ctrl-C, unexpected page layout),
    the rows collected so far are still flushed to Excel.
    """
    # Loop-invariant: build the request headers once, not once per page.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
    }
    try:
        for page in range(0, 60):
            # Each search page returns 20 movies as JSON; `start` pages it.
            url = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%83%82%E7%89%87&start=' + str(page * 20)
            content = requests.get(url, headers=headers)
            content_json = json.loads(content.text)["data"]
            for one_info in content_json:
                one_id = one_info["id"]
                print(one_id)
                url2 = "https://movie.douban.com/subject/%s/" % one_id
                html = requests.get(url2, headers=headers)
                if html.status_code == 200:
                    save_info(etree.HTML(html.content.decode("utf-8")))
                time.sleep(1)  # throttle: one detail request per second
    except (Exception, KeyboardInterrupt) as exc:
        # Was a bare `except:` that silently swallowed every error. Report
        # why the crawl stopped, then save the partial results anyway.
        print("抓取中断:", exc)
        processing_data(all_list)
if __name__ == '__main__':
    # Module-level accumulator that save_info() appends one row per movie to.
    all_list = []
    main()
    # Final write; main() also saves early if the crawl aborts mid-way.
    processing_data(all_list)
本人在校学生欢迎来交流python技术
QQ:5834135