背景
爬虫最常见的爬取方式就是requests+etree的方式了,用豆瓣电影的信息来做简单案例
url: https://movie.douban.com/cinema/later/beijing/
代码
import requests
from lxml import etree
import pandas as pd
class dangdang_home(object):
    """Scrape Douban's "coming soon" movie listing for Beijing.

    NOTE(review): the class name says "dangdang" but the target site is
    douban.com — the name is kept unchanged for backward compatibility.
    """

    def __init__(self):
        # Listing page of movies opening soon in Beijing.
        self.url = 'https://movie.douban.com/cinema/later/beijing/'

    def spider(self):
        """Fetch the listing page and return it as a parsed lxml HTML tree.

        Raises requests.HTTPError on a non-2xx response and
        requests.Timeout if the server does not answer within 10s.
        """
        # Browser User-Agent so the site serves the normal page instead of
        # rejecting the request as a bot.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        # timeout= keeps a dead network from hanging the script forever;
        # raise_for_status() surfaces HTTP errors instead of silently
        # handing an error page to the parser.
        resp = requests.get(self.url, headers=headers, timeout=10)
        resp.raise_for_status()
        return etree.HTML(resp.text)

    def parse_data(self, data):
        """Extract one dict per movie from the parsed listing page.

        Returns a list of dicts with keys: title, time, type, country,
        wanted — where "wanted" is the want-to-watch count as a digit
        string (suffix "人想看" stripped).
        """
        movie_info = []
        for movie in data.xpath('//*[@id="showing-soon"]/div/div'):
            title_nodes = movie.xpath('h3/a/text()')
            info_list = movie.xpath('ul/li/text()')
            wanted_nodes = movie.xpath('ul/li[4]/span/text()')
            # Skip malformed cards instead of crashing the whole scrape
            # with an IndexError.
            if not title_nodes or not wanted_nodes or len(info_list) < 3:
                continue
            # Raw text looks like "12345人想看"; drop the three-character
            # suffix to keep only the numeric count. (Equivalent to the
            # original double-reverse slice (s[-4::-1])[::-1].)
            wanted = wanted_nodes[0][:-3]
            movie_info.append({
                "title": title_nodes[0],
                "time": info_list[0],
                "type": info_list[1],
                "country": info_list[2],
                "wanted": wanted,
            })
        return movie_info
if __name__ == "__main__":
    dangdang = dangdang_home()
    page = dangdang.spider()
    movie_list = dangdang.parse_data(page)
    # Sort by want-to-watch count, most popular first.
    movie_list = sorted(movie_list, key=lambda m: int(m['wanted']), reverse=True)
    # Distinct loop variable — the original reused `data`, shadowing the
    # parsed page above.
    for movie in movie_list:
        print(movie)
    # Persist to CSV; index=False drops pandas' synthetic row index and
    # utf-8-sig adds a BOM so the Chinese text opens cleanly in Excel.
    df = pd.DataFrame(movie_list)
    df.to_csv("movie.csv", index=False, encoding="utf-8-sig")