# 豆瓣电影 (Douban Movie Top 250 scraper)

#!/usr/bin/env python

# encoding=utf-8

import requests
import re
import codecs
from bs4 import BeautifulSoup
from openpyxl import Workbook
# Create the workbook that will receive the scraped rows.
wb = Workbook()
# Write into the workbook's active worksheet.
ws1 = wb.active
ws1.title = "电影top250"
# Output spreadsheet path and the first listing page to fetch.
dest_filename = '电影.xlsx'
DOWNLOAD_URL = 'http://movie.douban.com/top250/'

def download_page(url, timeout=10):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The undecoded response body (``Response.content``).

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout, and (as ``HTTPError``) when the server answers with a
            4xx/5xx status instead of the page.
    """
    # A browser-like User-Agent is sent with every request; presumably the
    # site rejects the default client identifier -- verify if this changes.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here with a clear HTTP error rather than handing an error page
    # to the HTML parser.
    response.raise_for_status()
    return response.content

def get_li(doc):
    """Parse one listing page into parallel column lists.

    Args:
        doc: HTML markup of a listing page (text or bytes).

    Returns:
        A 5-tuple ``(name, star_con, score, info_list, next_url)`` where the
        first four items are equal-length lists holding, per movie, the
        title, the rating-count text, the score text and the one-line
        review ('无' when the page has none). ``next_url`` is the address of
        the following page, or ``None`` when there is no "next" link.

    Raises:
        ValueError: If the page does not contain the ``ol.grid_view`` movie
            list (e.g. an error or anti-bot page was downloaded).
    """
    soup = BeautifulSoup(doc, 'html.parser')
    ol = soup.find('ol', class_='grid_view')
    if ol is None:
        raise ValueError("movie list <ol class='grid_view'> not found in page")

    # Compiled once; it is reused for every list item below.
    votes_pattern = re.compile('评价')

    name = []  # movie titles
    star_con = []  # rating-count text
    score = []  # scores
    info_list = []  # one-line reviews

    for item in ol.find_all('li'):
        detail = item.find('div', attrs={'class': 'hd'})
        movie_name = detail.find(
            'span', attrs={'class': 'title'}).get_text()  # movie title
        level_star = item.find(
            'span', attrs={'class': 'rating_num'}).get_text()  # score

        # Rating count: the text node inside div.star containing '评价'.
        star = item.find('div', attrs={'class': 'star'})
        votes = star.find(string=votes_pattern) if star is not None else None
        # Use the same placeholder as for a missing review instead of
        # storing the literal string 'None'.
        star_num = str(votes) if votes is not None else '无'

        # Not every movie has a one-line review.
        info = item.find('span', attrs={'class': 'inq'})
        info_list.append(info.get_text() if info is not None else '无')

        score.append(level_star)
        name.append(movie_name)
        star_con.append(star_num)

    # Link to the next page; the span or its <a> may be absent.
    next_span = soup.find('span', attrs={'class': 'next'})
    page = next_span.find('a') if next_span is not None else None
    if page is not None:
        return name, star_con, score, info_list, DOWNLOAD_URL + page['href']
    return name, star_con, score, info_list, None

def main():
    """Download every listing page and write the results to the workbook.

    Starting at ``DOWNLOAD_URL``, follows the next-page URL returned by
    ``get_li`` until it is ``None``, accumulating titles, rating counts,
    scores and one-line reviews. Each movie is then written to one
    worksheet row (columns A-D) and the workbook is saved to
    ``dest_filename``.
    """
    url = DOWNLOAD_URL
    name = []
    star_con = []
    score = []
    info = []
    while url:
        doc = download_page(url)
        movie, star, level_num, info_list, url = get_li(doc)
        # extend() appends in place instead of rebuilding the lists per page.
        name.extend(movie)
        star_con.extend(star)
        score.extend(level_num)
        info.extend(info_list)
    # NOTE(review): the source's indentation was lost; the separator is
    # printed once after all pages are downloaded -- confirm placement.
    print('#######')
    # Number rows with enumerate(): looking the row up via name.index()
    # was quadratic and mapped duplicate titles onto the same row.
    for row, (title, votes, rating, quote) in enumerate(
            zip(name, star_con, score, info), start=1):
        col_A = 'A%s' % row
        col_B = 'B%s' % row
        col_C = 'C%s' % row
        col_D = 'D%s' % row
        print(col_A, col_B, col_C, col_D, row, title)
        ws1[col_A] = title
        ws1[col_B] = votes
        ws1[col_C] = rating
        ws1[col_D] = quote
    wb.save(filename=dest_filename)

# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值