python--爬虫电影数据

最新推荐文章于 2024-03-20 03:59:21 发布

echodouble

最新推荐文章于 2024-03-20 03:59:21 发布

阅读量590

点赞数

分类专栏：数据处理文章标签：爬虫

数据处理专栏收录该内容

13 篇文章 0 订阅

订阅专栏

# coding=utf-8
'''
获取豆瓣电影以及读书中的数据
'''

from selenium import webdriver
import time
import sys
from docutils.parsers.rst.directives import path

class Douban:
    """Scrape now-playing movies and popular books from Douban.

    The browser is started in ``__enter__`` and shut down in ``__exit__``,
    so the class is meant to be used as a context manager::

        with Douban() as douban:
            movies = douban.get_current_movies()
    """

    def __init__(self):
        self.movie_url = 'https://movie.douban.com/nowplaying/shenzhen/'
        self.book_url = 'https://book.douban.com'

    def __enter__(self):
        """Start the PhantomJS driver and return this scraper."""
        self.dr = webdriver.PhantomJS('phantomjs')
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser; never suppress an in-flight exception."""
        # The context-manager protocol always passes the three exception
        # arguments, so they must be accepted even though they are unused.
        self.dr.quit()
        return False

    @staticmethod
    def _by_rate(item):
        """Sort key: the item's ``'rate'`` text converted to a float."""
        return float(item['rate'])

    def get_current_movies(self):
        """Return the now-playing movies, highest rated first.

        Loads ``self.movie_url``, saves a screenshot of the page to
        ``douban.jpg`` and reads every ``list-item`` card inside the
        ``nowplaying`` element.

        Returns:
            A list of ``{'name': ..., 'rate': ...}`` dicts sorted by rate in
            descending order. Cards with an empty name or rate are skipped.
        """
        self.dr.get(self.movie_url)
        self.dr.save_screenshot('douban.jpg')
        wrap_div = self.dr.find_element_by_id('nowplaying')
        # Plural "find_elements": we need every card, not just the first one.
        cards = wrap_div.find_elements_by_class_name('list-item')
        movies = []
        for card in cards:
            item = {
                'name': card.find_element_by_css_selector('.stitle a').get_attribute('title'),
                'rate': card.find_element_by_css_selector('.subject-rate').text,
            }
            # Skip entries without a rate so the float() sort key cannot
            # be handed an empty string.
            if item['name'] and item['rate']:
                movies.append(item)
        return sorted(movies, key=self._by_rate, reverse=True)

    # Original (singular) method name, kept so existing callers keep working.
    get_current_movie = get_current_movies

    def get_hot_books(self):
        """Return the popular books from the front page, highest rated first.

        Loads ``self.book_url`` and reads every ``li`` inside the
        ``.section.popular-books`` element.

        Returns:
            A list of dicts with the keys ``'name'``, ``'rate'``,
            ``'author'``, ``'cagegory'`` and ``'comment'``, sorted by rate in
            descending order. Entries with an empty name or rate are skipped.
        """
        self.dr.get(self.book_url)
        wrap_div = self.dr.find_element_by_css_selector('.section.popular-books')
        # Plural "find_elements": we need every list entry, not just the first.
        cards = wrap_div.find_elements_by_tag_name('li')
        books = []
        for card in cards:
            item = {
                'name': card.find_element_by_css_selector('h4.title').text,
                'rate': card.find_element_by_css_selector('.average-rating').text,
                'author': card.find_element_by_css_selector('p.author').text,
                # NOTE(review): 'cagegory' looks like a typo for 'category';
                # the key is kept unchanged in case callers already read it.
                'cagegory': card.find_element_by_css_selector('p.book-list-classification').text,
                'comment': card.find_element_by_css_selector('p.reviews').text,
            }
            if item['name'] and item['rate']:
                books.append(item)
        return sorted(books, key=self._by_rate, reverse=True)


class DoubanReporter:
    """Write a small HTML report of Douban movie ratings to a file.

    The output file is opened when the reporter is constructed and closed by
    ``finish_report`` (which ``build_movie_report`` calls for you).
    """

    def __init__(self, path):
        """Open ``path`` for writing the report.

        Args:
            path: Location of the HTML file to create or overwrite.
        """
        self.reporter_path = path
        # Text mode with explicit UTF-8: every write below passes ``str``,
        # and the page header declares ``charset="utf-8"``.
        self.f = open(path, 'w', encoding='utf-8')

    def write_header(self):
        """Write the opening ``<html>`` tag and the complete ``<head>``."""
        self.f.write('<html><head><meta charset="utf-8">')
        self.f.write('<link rel="stylesheet"  href="http://cdn.bootcss.com/bootstrap/3.3.6/css/bootstrap.min.css">')
        self.f.write('<title>Douban Reporter </title></head>')

    def write_body(self):
        """Open the ``<body>`` element."""
        self.f.write('<body>')

    def finish_body(self):
        """Close the ``<body>`` element."""
        self.f.write('</body>')

    def append_image(self):
        """Embed the ``./douban.jpg`` image in the report."""
        image_name = './douban.jpg'
        # Format the markup first, then write it; applying ``%`` to the
        # return value of ``write()`` raises TypeError.
        self.f.write('<img src="%s" width="400px"></img>' % image_name)

    def write_movie(self, movie_items):
        """Write the movies as an ordered list with a rating label each.

        Args:
            movie_items: Iterable of dicts with ``'name'`` and ``'rate'``
                keys, written in the order given.
        """
        # Local import keeps this class self-contained.
        from html import escape

        self.f.write('<h3>豆瓣正在热映</h3>')
        self.f.write('<div style="width:400px">')
        self.f.write('<ol>')
        for movie in movie_items:
            # Names and rates come from a scraped web page, so escape them
            # before placing them inside markup.
            movie_item_html = '<li>%s<span style="float:right" class="label label-primary">%s</span></li>' % (
                escape(str(movie['name'])), escape(str(movie['rate'])))
            self.f.write(movie_item_html)
        self.f.write('</ol>')
        self.f.write('</div>')

    def finish_report(self):
        """Close ``<body>`` and ``<html>``, then close the output file."""
        self.finish_body()
        self.f.write('</html>')
        self.f.close()

    def build_movie_report(self, movie_items):
        """Write the complete report for ``movie_items`` and close the file.

        Args:
            movie_items: Iterable of dicts with ``'name'`` and ``'rate'`` keys.
        """
        try:
            self.write_header()
            self.write_body()
            self.write_movie(movie_items)
            self.append_image()
            self.finish_report()
        finally:
            # Closing twice is harmless; this guarantees the handle is
            # released even when one of the steps above raises.
            self.f.close()


if __name__ == '__main__':
    # Where to write the report: the first command-line argument if given,
    # otherwise 'douban.html' in the current directory. The report refers to
    # the screenshot as './douban.jpg', so the two files should sit together.
    report_path = sys.argv[1] if len(sys.argv) > 1 else 'douban.html'

    # Scrape inside the context manager so the browser is shut down as soon
    # as the data has been collected.
    with Douban() as douban:
        movies = douban.get_current_movie()

    # DoubanReporter takes the output file path; the movie list is passed to
    # build_movie_report.
    reporter = DoubanReporter(report_path)
    reporter.build_movie_report(movies)

echodouble

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
python--爬虫电影数据

# coding=utf-8 ''' 获取豆瓣电影以及读书中的数据 ''' from selenium import webdriver; import time; import sys; from docutils.parsers.rst.directives import path; class Douban: def __init__(self): self.movie_...
复制链接

扫一扫