AI实战第九课定时爬取豆瓣前250名的电影生成html_将爬取的电影名称写成html-CSDN博客

本文链接：https://blog.csdn.net/wzygis/article/details/113753399
#!/usr/bin/env python
# encoding=utf-8
import requests
import re
from bs4 import BeautifulSoup
from openpyxl import Workbook
import logging,dominate,os
from dominate.tags import *
import datetime
import xlrd
import sched
import time



# Module-level workbook shared by the scraper: get_movies_to_excel() fills
# the active sheet (ws1) and saves the workbook under dest_filename.
wb = Workbook()
dest_filename = 'movies.xlsx'
ws1 = wb.active
ws1.title = "movies-top-250"

# First listing page; get_li() appends the "next page" href to this URL.
DOWNLOAD_URL = 'http://movie.douban.com/top250/'


def download_page(url, timeout=10):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The response body as bytes (``Response.content``).

    Raises:
        requests.RequestException: on connection problems, timeouts, or an
            HTTP error status (via ``raise_for_status``), so an error page is
            never handed on as if it were a listing page.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.content


def get_li(doc):
    """Parse one listing page into parallel lists plus the next-page URL.

    Args:
        doc: HTML markup of a listing page, as accepted by BeautifulSoup.

    Returns:
        A 5-tuple ``(name, star_con, score, info_list, next_url)``:
        movie titles, the text nodes containing '评价' found in each item's
        ``star`` block, rating strings, one-line quotes ('无' when an item has
        none), and ``DOWNLOAD_URL`` joined with the "next" link's href, or
        ``None`` when the page has no such link.

    Raises:
        ValueError: if the page contains no ``<ol class="grid_view">``, which
            means it is not a listing page (previously this surfaced as an
            opaque ``AttributeError`` on ``None``).
    """
    soup = BeautifulSoup(doc, 'html.parser')
    ol = soup.find('ol', class_='grid_view')
    if ol is None:
        raise ValueError('movie list <ol class="grid_view"> not found in page')

    # Compiled once instead of once per list item.
    rating_count_pattern = re.compile('评价')

    name = []  # movie titles
    star_con = []  # rating-count text
    score = []  # rating values
    info_list = []  # one-line quotes
    for item in ol.find_all('li'):
        detail = item.find('div', attrs={'class': 'hd'})
        movie_name = detail.find(
            'span', attrs={'class': 'title'}).get_text()
        level_star = item.find(
            'span', attrs={'class': 'rating_num'}).get_text()
        star = item.find('div', attrs={'class': 'star'})
        # ``string=`` is the current spelling of the deprecated ``text=``.
        star_num = star.find(string=rating_count_pattern)

        info = item.find('span', attrs={'class': 'inq'})
        # Not every movie has a quote; keep the lists aligned with a filler.
        info_list.append(info.get_text() if info is not None else '无')

        score.append(level_star)
        name.append(movie_name)
        star_con.append(star_num)

    # A page without the "next" span is treated as having no next page.
    next_span = soup.find('span', attrs={'class': 'next'})
    page = next_span.find('a') if next_span is not None else None
    if page is not None:
        return name, star_con, score, info_list, DOWNLOAD_URL + page['href']
    return name, star_con, score, info_list, None

# Render a list of row dictionaries as a static HTML page.
def list_diction_to_html(list_work):
    """Build an HTML document containing one table for ``list_work``.

    NOTE: a later definition with the same name in this file replaces this
    one once the module has finished loading.

    Args:
        list_work: Non-empty list of dicts; the keys of the first dict become
            the header cells, and every dict becomes one table row.

    Returns:
        The rendered document as a string.
    """
    # The page title carries a timestamp and is echoed to stdout.
    stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    page_title = stamp + '-excel-to-html'
    print(page_title)
    page = dominate.document(title=page_title)

    # Stylesheet plus an explicit charset so browsers decode the page as UTF-8.
    with page.head:
        link(rel='stylesheet', href='https://cdn.staticfile.org/twitter-bootstrap/3.3.7/css/bootstrap.min.css')
        meta(charset='utf-8')

    # One table: header cells from the first record, then one row per record.
    with page:
        wrapper = div(id='excel_table')
        with wrapper.add(table()):
            with thead():
                for column_name in list_work[0]:
                    header_cell = td()
                    header_cell.add(p(column_name))
            for record in list_work:
                row_tag = tr(cls='excel_table_row')
                for cell_value in record.values():
                    with row_tag.add(td()):
                        p(cell_value)
    return str(page)
# Save the generated HTML to disk.
def save_dom_to_html(dom):
    """Write ``dom`` to ``index.html`` in the current working directory.

    NOTE: a later definition with the same name in this file replaces this
    one once the module has finished loading.

    Args:
        dom: The HTML text to write.

    Returns:
        The absolute path of the written file.
    """
    filepath = os.path.abspath("index.html")
    # Explicit UTF-8 so the bytes on disk do not depend on the platform's
    # default encoding; ``with`` closes the handle even if the write fails.
    with open(filepath, "w", encoding="utf-8") as htmfile:
        htmfile.write(dom)
    return filepath

def get_movies_to_excel():
    """Scrape every listing page and store the results in the workbook.

    Starting at ``DOWNLOAD_URL``, follows the next-page URL returned by
    ``get_li`` until it is ``None``, then writes one movie per row into
    columns A-D of ``ws1`` (title, rating-count text, rating, quote) and
    saves the workbook as ``dest_filename``.

    NOTE(review): no header row is written, while excel_sheet_processor()
    treats the first row as column names -- confirm whether a header row
    should be added.
    """
    url = DOWNLOAD_URL
    name = []
    star_con = []
    score = []
    info = []
    while url:
        doc = download_page(url)
        print(url)
        movie, star, level_num, info_list, url = get_li(doc)
        name.extend(movie)
        star_con.extend(star)
        score.extend(level_num)
        info.extend(info_list)

    # Number rows by position. Looking the row up with name.index() was
    # quadratic and sent movies sharing a title to the same row.
    for row, record in enumerate(zip(name, star_con, score, info), start=1):
        for column_letter, value in zip('ABCD', record):
            ws1['%s%d' % (column_letter, row)] = value
    wb.save(filename=dest_filename)


# Read the saved Excel sheet back into a list of row dictionaries.
def excel_sheet_processor(filepath):
    """Read the first sheet of an ``.xlsx`` workbook into a list of dicts.

    The first row supplies the dictionary keys; each following row becomes
    one dict mapping those keys to the row's cell values. Empty cells are
    returned as ``''``.

    The file is read with openpyxl because xlrd 2.0 and later no longer open
    ``.xlsx`` files. Unlike xlrd, openpyxl returns whole numbers as ``int``
    rather than ``float``.

    Args:
        filepath: Path of the workbook to read.

    Returns:
        A list with one dict per data row; empty when the sheet has no rows
        after the first.
    """
    # Local import: the file's top-level import only brings in ``Workbook``.
    from openpyxl import load_workbook

    def _cell(row, pos):
        # Missing or empty cells read as '' (what xlrd reported for blanks).
        if pos >= len(row) or row[pos] is None:
            return ''
        return row[pos]

    book = load_workbook(filepath, read_only=True)
    try:
        sheet = book.worksheets[0]
        rows = sheet.iter_rows(values_only=True)
        header = next(rows, None)
        if header is None:
            return []
        my_keys = [_cell(header, pos) for pos in range(len(header))]

        workbook_list = []
        for row in rows:
            workbook_list.append(
                {key: _cell(row, pos) for pos, key in enumerate(my_keys)})
        return workbook_list
    finally:
        # Read-only workbooks keep the file open until closed explicitly.
        book.close()


# Render a list of row dictionaries as a static HTML page.
def list_diction_to_html(list_work):
    """Build an HTML document containing one table for ``list_work``.

    Args:
        list_work: List of dicts. The keys of the first dict become the
            header cells and every dict becomes one table row. An empty list
            yields a page with an empty table instead of an ``IndexError``.

    Returns:
        The rendered document as a string.
    """
    doc = dominate.document(title='excel-to-html')
    # Stylesheet plus an explicit charset so browsers decode the page as UTF-8.
    with doc.head:
        link(rel='stylesheet', href='page.css')
        meta(charset='utf-8')
    # One table: header cells from the first record, then one row per record.
    with doc:
        with div(id='excel_table').add(table()):
            with thead():
                header_record = list_work[0] if list_work else {}
                for key in header_record:
                    table_header = td()
                    table_header.add(p(key))
            for record in list_work:
                table_row = tr(cls='excel_table_row')
                for key in record:
                    with table_row.add(td()):
                        p(record[key])
    return str(doc)

# Save the generated HTML to disk.
def save_dom_to_html(dom):
    """Write ``dom`` to ``excel.html`` in the current working directory.

    The file is overwritten on each call. ``dom`` is a complete HTML
    document, so appending it (the previous ``"a+"`` mode) stacked one whole
    document after another on every scheduled run.

    Args:
        dom: The HTML text to write.

    Returns:
        The absolute path of the written file.
    """
    filepath = os.path.abspath("excel.html")
    # ``with`` closes the handle even if the write fails.
    with open(filepath, "w", encoding="utf-8") as htmfile:
        htmfile.write(dom)
    return filepath

def read_excel_to_html():
    """Convert the saved workbook into an HTML file.

    Loads the rows of ``dest_filename`` through ``excel_sheet_processor``;
    when there are any, renders them with ``list_diction_to_html`` and writes
    the result with ``save_dom_to_html``. Does nothing when no rows come back.
    """
    rows = excel_sheet_processor(os.path.abspath(dest_filename))
    if not rows:
        return
    html_text = list_diction_to_html(rows)
    save_dom_to_html(html_text)


def print_time(task_id):
    """Scheduler callback: queue the next run, log the time, scrape, render.

    Args:
        task_id: Label printed with the run timestamp and passed on to the
            next scheduled invocation.
    """
    # Re-arm first, so the next run is queued before this run's work starts.
    schedule.enter(5, 1, print_time, argument=(task_id,))
    timestamp = int(time.time())
    print("#{} run time: {}".format(task_id, timestamp))
    get_movies_to_excel()
    read_excel_to_html()

# Shared scheduler used by print_time() and start(); time.time supplies the
# clock and time.sleep the delay function.
schedule = sched.scheduler(time.time, time.sleep)
def start(inc=1):
    """Queue the first ``print_time`` run and hand control to the scheduler.

    Args:
        inc: Value passed to ``print_time`` as its ``task_id``.
    """
    # enter() takes: delay, priority (a lower number runs first when two
    # events are due at the same time), the callback, and its arguments as a
    # tuple -- hence the one-element tuple (inc,).
    first_delay = 1
    priority = 1
    schedule.enter(first_delay, priority, print_time, argument=(inc,))
    schedule.run()

# Start the scheduler loop only when this file is executed as a script.
if __name__ == '__main__':
    start()