# lxml 爬取豆瓣top250 (Scrape the Douban Top 250 with lxml)
# 菜鸟奋战10小时,得到豆瓣top250 (a beginner's 10-hour effort to crawl the list)
from time import sleep
import urllib3
import pandas as pd
urllib3.disable_warnings()
from lxml import etree
def getTree(url):
    """Download *url* with browser-like headers and return the parsed HTML tree.

    Parameters
    ----------
    url : str
        Page URL to fetch.

    Returns
    -------
    lxml.etree._Element
        Root element of the parsed HTML document.
    """
    send_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
        "Connection": "keep-alive",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8"
    }
    pool_manager = urllib3.PoolManager()
    # BUG FIX: urllib3's request() signature is (method, url, fields=None,
    # headers=None) — passing the dict positionally made it `fields`, so the
    # headers (including the User-Agent Douban requires) were never sent.
    res = pool_manager.request('GET', url, headers=send_headers)
    # Decode the response body (UTF-8 by default).
    r = res.data.decode()
    return etree.HTML(r)
def getPageInfo(url_page):
    """Scrape one paginated Top-250 page and return its movies' fields.

    Parameters
    ----------
    url_page : str
        URL of one Top-250 list page (25 entries).

    Returns
    -------
    tuple
        ``(names, infos, stars, nums, qutos)`` where

        - ``names``: list[str], concatenated title strings;
        - ``infos``: flat tuple of ``(director, year, country, genre)``
          repeated once per movie (``' '`` placeholders when missing);
        - ``stars``: list[str], rating scores;
        - ``nums``: list[str], vote-count strings;
        - ``qutos``: list[str], one-line quotes.
    """
    tree = getTree(url_page)
    ol = tree.xpath('//ol[@class="grid_view"]')[0]
    # BUG FIX: '//li' is an absolute path and scans the whole document even
    # when evaluated from an element; './/li' stays inside this <ol>.
    lis = ol.xpath('.//li')
    names = []   # joined title strings, one per movie
    infos = ()   # flat tuple: 4 fields per movie, de-interleaved by caller
    stars = []   # rating scores
    nums = []    # vote counts
    qutos = []   # one-line quotes
    for li in lis:
        name = li.xpath('div[@class="item"]/div[@class="info"]/div[@class="hd"]/a/span/text()')
        p = li.xpath('div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class]/text()')
        star = li.xpath(
            'div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')
        num = li.xpath('div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[last()]/text()')
        quto = li.xpath('div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span/text()')
        # Titles arrive as ['main', '\xa0/\xa0alt', ...]; drop the separators
        # and join into one string.
        names_final = ''.join(x.replace('\xa0/\xa0', '') for x in name)
        if not names_final:
            # Skip malformed entries entirely so all result lists stay aligned.
            continue
        p_split = [e.split() for e in p]
        # Placeholder keeps the 4-field-per-movie alignment when info is absent.
        info = (' ', ' ', ' ', ' ')
        # BUG FIX: the original tested len(p_split) > 0 but then read
        # p_split[1], risking an IndexError; require both lines to exist.
        if len(p_split) > 1:
            director = ' '.join(p_split[0])
            year_country_genre = ' '.join(p_split[1])
            parts = year_country_genre.split('/')
            # BUG FIX: guard the '/'-split before indexing parts[1]/parts[2].
            if len(parts) >= 3:
                # (director, year, country, genre); avoids shadowing builtin `type`.
                info = (director, parts[0].strip(), parts[1].strip(), parts[2].strip())
        names.append(names_final)
        infos += info
        # A single-space placeholder keeps every list in step when a field is
        # missing (extend over a 1-char string appends that one character).
        stars.extend(star if star else ' ')
        nums.extend(num if num else ' ')
        qutos.extend(quto if quto else ' ')
    return names, infos, stars, nums, qutos
def main(url_home):
    """Crawl all ten pages of the Douban Top 250 and save them to disk.

    Writes ``douban.xlsx`` and ``douban.csv`` in the working directory.

    Parameters
    ----------
    url_home : str
        Base URL of the Top-250 list (pagination via ``?start=N&filter=``).
    """
    pages_count = 250 // 25  # 10 pages of 25 movies each
    namess = []
    infoss = ()  # flat (director, year, country, genre) tuple per movie
    starss = []
    numss = []
    qutoss = []
    for i in range(pages_count):
        page_url = url_home + '?start=' + str(25 * i) + '&filter='
        # Throttle: wait 2 s between pages to avoid being blocked.
        sleep(2)
        names, infos, stars, nums, qutos = getPageInfo(page_url)
        namess.extend(names)
        infoss += infos
        starss.extend(stars)
        numss.extend(nums)
        qutoss.extend(qutos)
    # De-interleave the flat 4-fields-per-movie tuple with strided slices.
    actors = infoss[0::4]
    years = infoss[1::4]
    countries = infoss[2::4]
    types = infoss[3::4]
    data = {'namess': namess, 'actors': actors, 'years': years, 'countries': countries, 'types': types,
            'starss': starss, 'numss': numss, 'qutoss': qutoss}
    df = pd.DataFrame(data)
    # BUG FIX: modern pandas no longer writes legacy .xls files (the xlwt
    # engine was removed); write .xlsx via openpyxl instead.
    df.to_excel('douban.xlsx')
    df.to_csv('douban.csv')
    print('finished!')
# Landing page of the Douban Top 250 chart; individual pages are reached
# by appending ?start=N&filter= (see main()).
url_home = "https://movie.douban.com/top250"

if __name__ == '__main__':
    # Run the full crawl only when executed as a script, not on import.
    main(url_home)
# 效果 (result: the scraped table is written to douban.xlsx / douban.csv)