python--爬虫电影数据

# coding=utf-8
'''
获取豆瓣电影以及读书中的数据
'''

from selenium import webdriver
import time
import sys
from docutils.parsers.rst.directives import path

class Douban:
    """Scrape Douban's now-playing movie page and book front page.

    Use as a context manager: entering starts a PhantomJS browser session
    (stored on ``self.dr``) and exiting quits it.
    """

    def __init__(self):
        self.movie_url = 'https://movie.douban.com/nowplaying/shenzhen/'
        self.book_url = 'https://book.douban.com'
        # The driver is created in __enter__; None lets __exit__ run safely
        # even if the browser was never started.
        self.dr = None

    def __enter__(self):
        """Start the PhantomJS driver and return this scraper."""
        self.dr = webdriver.PhantomJS('phantomjs')
        return self

    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
        """Quit the browser.

        The context-manager protocol passes three exception arguments; they
        default to None so a direct ``__exit__()`` call keeps working.
        Returns False so exceptions raised in the ``with`` body propagate.
        """
        if self.dr is not None:
            self.dr.quit()
            self.dr = None
        return False

    @staticmethod
    def _rate_key(item):
        """Return the item's rating as a float for sorting.

        Rating text that is not a number sorts as 0.0 instead of raising.
        """
        try:
            return float(item['rate'])
        except (TypeError, ValueError):
            return 0.0

    def get_current_movie(self):
        """Return the now-playing movies sorted by rating, highest first.

        Loads ``self.movie_url``, saves a screenshot to ``douban.jpg`` in the
        working directory, and collects one dict per ``.list-item`` card with
        the keys ``'name'`` and ``'rate'`` (both strings). Cards with an empty
        name or rating are skipped.
        """
        self.dr.get(self.movie_url)
        self.dr.save_screenshot('douban.jpg')
        wrap_div = self.dr.find_element_by_id('nowplaying')
        # Plural lookup: every card is needed, not just the first match.
        cards = wrap_div.find_elements_by_class_name('list-item')
        movies = []
        for card in cards:
            item = {
                'name': card.find_element_by_css_selector('.stitle a').get_attribute('title'),
                'rate': card.find_element_by_css_selector('.subject-rate').text,
            }
            if item['name'] and item['rate']:
                movies.append(item)
        return sorted(movies, key=self._rate_key, reverse=True)

    # Alias under the plural name that the script entry point calls.
    get_current_movies = get_current_movie

    def get_hot_books(self):
        """Return the popular books sorted by rating, highest first.

        Loads ``self.book_url`` and collects one dict per ``li`` inside the
        ``.section.popular-books`` element with the keys ``'name'``,
        ``'rate'``, ``'author'``, ``'cagegory'`` and ``'comment'`` (all
        strings). Entries with an empty name or rating are skipped.
        """
        self.dr.get(self.book_url)
        wrap_div = self.dr.find_element_by_css_selector('.section.popular-books')
        # Plural lookup: every list entry is needed, not just the first match.
        cards = wrap_div.find_elements_by_tag_name('li')
        books = []
        for card in cards:
            item = {
                'name': card.find_element_by_css_selector('h4.title').text,
                'rate': card.find_element_by_css_selector('.average-rating').text,
                'author': card.find_element_by_css_selector('p.author').text,
                # NOTE(review): 'cagegory' looks like a typo for 'category';
                # the key is kept as-is because callers may rely on it.
                'cagegory': card.find_element_by_css_selector('p.book-list-classification').text,
                'comment': card.find_element_by_css_selector('p.reviews').text,
            }
            if item['name'] and item['rate']:
                books.append(item)
        return sorted(books, key=self._rate_key, reverse=True)


class DoubanReporter:
    """Write a small Bootstrap-styled HTML report of scraped Douban movies.

    The output file is opened when the reporter is constructed and closed by
    ``finish_report`` (or by ``build_movie_report``, even if a write fails).
    """

    def __init__(self, path):
        """Open ``path`` for writing; any existing file is truncated."""
        self.reporter_path = path
        # Text mode with explicit UTF-8: the report contains non-ASCII text
        # and declares charset utf-8 in its <meta> tag.
        self.f = open(path, 'w', encoding='utf-8')

    def write_header(self):
        """Write the opening <html> tag and the complete <head> section."""
        self.f.write('<html><head><meta charset="utf-8">')
        self.f.write('<link rel="stylesheet"  href="http://cdn.bootcss.com/bootstrap/3.3.6/css/bootstrap.min.css">')
        self.f.write('<title>Douban Reporter </title></head>')

    def write_body(self):
        """Open the <body> element."""
        self.f.write('<body>')

    def finish_body(self):
        """Close the <body> element."""
        self.f.write('</body>')

    def append_image(self):
        """Embed the screenshot ``./douban.jpg`` as an image."""
        image_name = './douban.jpg'
        # Format the tag before writing; <img> is a void element and takes
        # no closing tag.
        self.f.write('<img src="%s" width="400px">' % image_name)

    def write_movie(self, movie_items):
        """Write an ordered list of movies.

        ``movie_items`` is an iterable of mappings with ``'name'`` and
        ``'rate'`` keys. Both values are HTML-escaped because they come from
        scraped page content.
        """
        import html

        self.f.write('<h3>豆瓣正在热映</h3>')
        self.f.write('<div style="width:400px">')
        self.f.write('<ol>')
        for movie in movie_items:
            movie_item_html = '<li>%s<span style="float:right" class="label label-primary">%s</span></li>' % (
                html.escape(str(movie['name'])), html.escape(str(movie['rate'])))
            self.f.write(movie_item_html)
        self.f.write('</ol>')
        self.f.write('</div>')

    def finish_report(self):
        """Close the <body> and <html> elements, then close the file."""
        self.finish_body()
        self.f.write('</html>')
        self.f.close()

    def build_movie_report(self, movie_items):
        """Write the complete movie report and close the output file."""
        try:
            self.write_header()
            self.write_body()
            self.write_movie(movie_items)
            self.append_image()
            self.finish_report()
        finally:
            # Closing is idempotent; this releases the handle when a write
            # fails before finish_report is reached.
            self.f.close()


if __name__ == '__main__':
    # Scrape the now-playing movies, then render them as an HTML report.
    with Douban() as douban:
        movies = douban.get_current_movie()
        # The reporter takes an output file path, not the movie list. The
        # report sits in the working directory because it references the
        # screenshot as './douban.jpg', which the scraper saves there.
        reporter = DoubanReporter('douban_report.html')
        reporter.build_movie_report(movies)










  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值