#!/usr/bin/env python
# encoding=utf-8
import requests
import re
from bs4 import BeautifulSoup
from openpyxl import Workbook
import logging,dominate,os
from dominate.tags import *
import datetime
import xlrd
import sched
import time
# Module-level workbook shared by the scraper; get_movies_to_excel() fills
# its active sheet and saves it.
wb = Workbook()
# File name the workbook is saved under and later read back from.
dest_filename = 'movies.xlsx'
ws1 = wb.active
ws1.title = "movies-top-250"
# Address of the first listing page; next-page hrefs are appended to it.
DOWNLOAD_URL = 'http://movie.douban.com/top250/'
def download_page(url, timeout=10):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would stall the
            caller on a dead connection.

    Returns:
        The response body as bytes.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (4xx/5xx).
    """
    # Every request is sent with a desktop-browser User-Agent string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to the
    # HTML parser.
    response.raise_for_status()
    return response.content
def get_li(doc):
    """Parse one listing page and extract the movie data it contains.

    Args:
        doc: HTML of a listing page, in any form ``BeautifulSoup`` accepts.

    Returns:
        A 5-tuple ``(name, star_con, score, info_list, next_url)``. The first
        four items are parallel lists with one entry per movie: the title,
        the text node containing '评价' (rating count), the rating text, and
        the short quote ('无' when the movie has none). ``next_url`` is
        ``DOWNLOAD_URL`` plus the next-page href, or ``None`` when the page
        has no next-page link.
    """
    soup = BeautifulSoup(doc, 'html.parser')
    name = []       # movie titles
    star_con = []   # rating-count text
    score = []      # rating values
    info_list = []  # short quotes

    ol = soup.find('ol', class_='grid_view')
    if ol is None:
        # find() returns None when nothing matches; report an empty page
        # instead of failing on None.find_all().
        logging.warning('no movie list found in the downloaded page')
        return name, star_con, score, info_list, None

    for item in ol.find_all('li'):
        detail = item.find('div', attrs={'class': 'hd'})
        movie_name = detail.find(
            'span', attrs={'class': 'title'}).get_text()
        level_star = item.find(
            'span', attrs={'class': 'rating_num'}).get_text()
        star = item.find('div', attrs={'class': 'star'})
        star_num = star.find(text=re.compile('评价'))
        info = item.find('span', attrs={'class': 'inq'})
        # Not every movie carries a short quote; '无' marks the gap so the
        # four lists stay aligned.
        info_list.append(info.get_text() if info else '无')
        score.append(level_star)
        name.append(movie_name)
        star_con.append(star_num)

    # The next-page anchor lives inside <span class="next">; either the span
    # or the anchor may be missing.
    next_span = soup.find('span', attrs={'class': 'next'})
    page = next_span.find('a') if next_span else None
    if page:
        return name, star_con, score, info_list, DOWNLOAD_URL + page['href']
    return name, star_con, score, info_list, None
# 创建excel生成静态html页面的函数
def list_diction_to_html(list_work):
    """Render a list of row dictionaries as a static HTML page.

    NOTE: a later definition with the same name in this file replaces this
    one when the module is loaded.

    Args:
        list_work: Non-empty list of dictionaries, one per table row. The
            keys of the first dictionary become the header cells.

    Returns:
        The complete HTML document as a string.
    """
    # The page title carries the current timestamp and is echoed to stdout.
    stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    title = stamp + '-excel-to-html'
    print(title)
    page = dominate.document(title=title)

    # Head: stylesheet plus an explicit charset; without the meta tag the
    # page may be displayed with garbled characters.
    with page.head:
        link(rel='stylesheet', href='https://cdn.staticfile.org/twitter-bootstrap/3.3.7/css/bootstrap.min.css')
        meta(charset='utf-8')

    # Body: a single table wrapped in a div, filled by walking the rows.
    with page:
        with div(id='excel_table').add(table()):
            with thead():
                for column in list_work[0]:
                    header_cell = td()
                    header_cell.add(p(column))
            for record in list_work:
                row_tag = tr(cls='excel_table_row')
                for value in record.values():
                    with row_tag.add(td()):
                        p(value)
    return str(page)
# 保存生成后html的函数
def save_dom_to_html(dom):
    """Write an HTML string to ``index.html`` in the working directory.

    NOTE: a later definition of ``save_dom_to_html`` in this file replaces
    this one when the module is loaded.

    Args:
        dom: The HTML document as a string.

    Returns:
        Absolute path of the written file.
    """
    filepath = os.path.abspath("index.html")
    # Explicit UTF-8 so non-ASCII text is written the same way on every
    # platform; the context manager closes the file even if the write fails.
    with open(filepath, "w", encoding="utf-8") as htmfile:
        htmfile.write(dom)
    return filepath
def get_movies_to_excel():
    """Scrape every listing page and save the collected rows to the workbook.

    Starting at ``DOWNLOAD_URL``, follows the next-page URL returned by
    ``get_li`` until it is ``None``, printing each URL as it is fetched.
    Each movie is then written to its own row of ``ws1`` (columns A-D: title,
    rating-count text, rating, short quote) and the workbook is saved as
    ``dest_filename``.

    NOTE(review): data starts in the first row with no header row, while
    ``excel_sheet_processor`` uses the first row as column headers — confirm
    whether a header row should be written here.
    """
    url = DOWNLOAD_URL
    name = []
    star_con = []
    score = []
    info = []
    while url:
        doc = download_page(url)
        print(url)
        movie, star, level_num, info_list, url = get_li(doc)
        name.extend(movie)
        star_con.extend(star)
        score.extend(level_num)
        info.extend(info_list)

    # The row number is the movie's position in the list, not a lookup by
    # title, so movies that share a title still get separate rows.
    for row, record in enumerate(zip(name, star_con, score, info), start=1):
        for column, value in zip('ABCD', record):
            ws1['%s%d' % (column, row)] = value
    wb.save(filename=dest_filename)
# 创建获取excel数据的函数
def excel_sheet_processor(filepath):
    """Read the first sheet of a workbook into a list of row dictionaries.

    Args:
        filepath: Path of the workbook to open with ``xlrd``.

    Returns:
        One dictionary per row after the first. Keys are the cell values of
        the first row; values are the cell values of that row in the
        matching columns.
    """
    book = xlrd.open_workbook(filepath, on_demand=True)
    sheet = book.sheet_by_index(0)

    # The first row (index 0) supplies the dictionary keys.
    headers = [sheet.cell_value(rowx=0, colx=col) for col in range(sheet.ncols)]

    # Every following row becomes one dictionary keyed by those headers.
    rows = []
    for row_idx in range(1, sheet.nrows):
        rows.append({
            header: sheet.cell_value(rowx=row_idx, colx=col)
            for col, header in enumerate(headers)
        })
    return rows
# 创建excel生成静态html页面的函数
def list_diction_to_html(list_work):
    """Render a list of row dictionaries as a static HTML page.

    This definition replaces the earlier function of the same name in this
    file.

    Args:
        list_work: Non-empty list of dictionaries, one per table row. The
            keys of the first dictionary become the header cells.

    Returns:
        The complete HTML document (title 'excel-to-html') as a string.
    """
    page = dominate.document(title='excel-to-html')

    # Head: local stylesheet plus an explicit charset; without the meta tag
    # the page may be displayed with garbled characters.
    with page.head:
        link(rel='stylesheet', href='page.css')
        meta(charset='utf-8')

    # Body: one table inside a div; header cells first, then one tr per row.
    with page:
        with div(id='excel_table').add(table()):
            with thead():
                first_row = list_work[0]
                for heading in first_row:
                    heading_cell = td()
                    heading_cell.add(p(heading))
            for entry in list_work:
                row_tag = tr(cls='excel_table_row')
                for _key, cell_value in entry.items():
                    with row_tag.add(td()):
                        p(cell_value)
    return str(page)
# 保存生成后html的函数
def save_dom_to_html(dom):
    """Append an HTML string to ``excel.html`` in the working directory.

    Args:
        dom: The HTML document as a string.

    Returns:
        Absolute path of the written file.
    """
    filepath = os.path.abspath("excel.html")
    # NOTE(review): "a+" appends, so every call adds another copy of the
    # document to the same file — confirm that overwriting ("w") is not what
    # is wanted.
    # The context manager closes the file even if the write fails.
    with open(filepath, "a+", encoding="utf-8") as htmfile:
        htmfile.write(dom)
    return filepath
def read_excel_to_html():
    """Convert the saved workbook into an HTML file.

    Reads ``dest_filename`` through ``excel_sheet_processor``. When that
    yields any rows, they are rendered with ``list_diction_to_html`` and
    written out with ``save_dom_to_html``. Nothing is written otherwise.
    """
    rows = excel_sheet_processor(os.path.abspath(dest_filename))
    if not rows:
        return
    save_dom_to_html(list_diction_to_html(rows))
def print_time(task_id):
    """Run one scrape-and-export cycle and queue the next one.

    Args:
        task_id: Label printed alongside the timestamp and handed on to the
            next scheduled run.
    """
    # Queue the next run (delay 5, priority 1) before doing the work.
    schedule.enter(delay=5, priority=1, action=print_time, argument=(task_id,))
    stamp = int(time.time())
    print("#{} run time: {}".format(task_id, stamp))
    get_movies_to_excel()
    read_excel_to_html()
# Scheduler shared by start() and print_time(): wall-clock time source with
# time.sleep as the delay function.
schedule = sched.scheduler(timefunc=time.time, delayfunc=time.sleep)
def start(inc=1):
    """Schedule the first cycle and run the scheduler.

    Args:
        inc: Task label passed to ``print_time``.
    """
    # enter() takes the delay, the priority (orders two events due at the
    # same time; the lower number runs first), the function to call, and its
    # arguments as a tuple (a single argument is written ``(value,)``).
    schedule.enter(delay=1, priority=1, action=print_time, argument=(inc,))
    schedule.run()
if __name__ == '__main__':
    # Run as a script: kick off the scheduled scrape-and-export loop.
    start()