Python Example: Crawling Douban Movies

File structure

The project consists of four small modules:

- html_downloader.py
- html_outputer.py
- html_parser.py
- spider_main.py

html_downloader.py - downloads the HTML content of a page

```python
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib2


class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None
        response = urllib2.urlopen(url)
        # Only accept a successful response
        if response.getcode() != 200:
            return None
        return response.read()
```
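Note that urllib2 exists only under Python 2. If you want to run this module under Python 3, a minimal equivalent sketch using the standard-library urllib.request (my adaptation, not part of the original project) could look like this:

```python
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
# Sketch: a Python 3 port of HtmlDownloader using urllib.request.
from urllib.request import urlopen


class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None
        # urlopen raises urllib.error.HTTPError for most non-200 statuses,
        # so this explicit check is mostly a safeguard.
        response = urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()
```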

html_outputer.py - writes the results to a file

```python
#!/usr/bin/python
# -*- coding: UTF-8 -*-


class HtmlOutputer(object):

    def collect_data(self, movie_data):
        if movie_data is None:
            return
        fout = open('output.html', 'a+')
        for data in movie_data:
            # Echo progress to the console (Python 2 print statement)
            print data['name'] + '|', data['rate'] + '|', data['actor'], '\n'
            # Write one comma-separated record per movie
            fout.write('%s,' % data['name'].encode('utf-8'))
            fout.write('%s,' % data['rate'])
            fout.write('%s\n' % data['actor'].encode('utf-8'))
        fout.close()
```

html_parser.py - the parser: walks the HTML DOM tree

```python
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup


class HtmlParser(object):

    def parser_html(self, cnt):
        if cnt is None:
            return
        soup = BeautifulSoup(cnt, 'html.parser', from_encoding='utf-8')
        return self.get_movie_names(soup)

    def get_movie_names(self, soup):
        movie_data = []
        # Each movie on the tag page sits in its own <table>: start at the
        # article div, skip the pager table and the filter div, then collect
        # every per-movie table.
        movie_all = (soup.find('div', class_='article')
                         .find_next('table')
                         .find_next_sibling('div')
                         .find_next_sibling('div')
                         .find_all('table'))
        for movie_one in movie_all:
            movie_data.append(self.get_movie_name(movie_one))
        return movie_data

    def get_movie_name(self, cnt):
        info = {}
        soup = BeautifulSoup(str(cnt), 'html.parser', from_encoding='utf-8')
        movie_one = (soup.find('tr', class_='item')
                         .find_next('td')
                         .find_next_sibling('td')
                         .find('div', class_='pl2'))
        info['name'] = movie_one.find('a').get_text().replace("\n", "").replace(" ", "")
        info['actor'] = movie_one.find('p', class_='pl').get_text().replace("\n", "").replace(" ", "")
        info['rate'] = movie_one.find('div', class_='star clearfix').find('span', class_='rating_nums').get_text()
        return info
```
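The chained find calls above are tightly coupled to the layout of Douban's tag page at the time this was written. If the navigation looks opaque, the following minimal snippet (an illustration of mine, not from the project) shows how `find`, `find_next` and `find_next_sibling` walk the same shape of document:

```python
from bs4 import BeautifulSoup

# A stripped-down stand-in for Douban's tag-page markup.
html = """
<div class="article">
  <table>pager</table>
  <div>filter bar</div>
  <div>
    <table><tr class="item"><td>poster</td>
      <td><div class="pl2"><a>MovieName</a>
        <p class="pl">actors</p>
        <div class="star clearfix"><span class="rating_nums">8.5</span></div>
      </div></td></tr></table>
  </div>
</div>
"""
soup = BeautifulSoup(html, 'html.parser')
# Same navigation as get_movie_names: start at the article div, skip the
# pager table and the filter div, then grab every per-movie <table>.
tables = (soup.find('div', class_='article')
              .find_next('table')
              .find_next_sibling('div')
              .find_next_sibling('div')
              .find_all('table'))
print(len(tables))  # -> 1
```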

spider_main.py - the main entry point

```python
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import html_parser, html_outputer, html_downloader


class SpiderMain(object):

    def __init__(self):
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()
        self.downloader = html_downloader.HtmlDownloader()

    def craw(self, url):
        # Download the page, parse out the movie records, then persist them
        html_cnt = self.downloader.download(url)
        movie_data = self.parser.parser_html(html_cnt)
        self.outputer.collect_data(movie_data)


if __name__ == '__main__':
    url = 'https://movie.douban.com/tag/2017?start=100&type=T'
    spider = SpiderMain()
    spider.craw(url)
```
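The tag listing is paginated through the `start` query parameter (the example URL already requests the page starting at item 100). To crawl several pages in one run, the `__main__` block could loop over offsets; a sketch, assuming 20 items per page on the tag listing:

```python
# Sketch: replaces the __main__ block of spider_main.py.
# The 20-items-per-page step is an assumption about Douban's tag pages.
if __name__ == '__main__':
    spider = SpiderMain()
    for start in range(0, 100, 20):
        spider.craw('https://movie.douban.com/tag/2017?start=%d&type=T' % start)
```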

Summary

In essence this just uses the urllib2 and BeautifulSoup libraries, so there is not much more to say. You can point it at a different url and modify html_parser.py to fit your own crawling needs, and of course you can also modify html_outputer.py to define the output format; at the moment it writes comma-separated values (to output.html).
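For instance, to produce a proper CSV file instead of hand-joined comma fields, html_outputer.py could delegate to the standard csv module. Here is a sketch for Python 3 (where the manual utf-8 encoding becomes unnecessary); the field names mirror the keys produced by HtmlParser, and the output file name is my choice:

```python
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
# Sketch: a csv-module variant of HtmlOutputer (Python 3).
import csv


class HtmlOutputer(object):

    def collect_data(self, movie_data):
        if movie_data is None:
            return
        with open('output.csv', 'a', newline='', encoding='utf-8') as fout:
            writer = csv.DictWriter(fout, fieldnames=['name', 'rate', 'actor'])
            writer.writerows(movie_data)
```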

Appendix: crawling the Douban Music Top 250

Below is a similar, self-contained example that crawls the Douban Music Top 250 chart:

```python
import requests
from bs4 import BeautifulSoup

url = 'https://music.douban.com/top250'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

def get_music_info(url):
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    music_list = soup.find('div', class_='article').find_all('tr', class_=lambda x: x and x.startswith('item'))
    for music in music_list:
        rank = music.find('td', class_='').get_text().strip()
        music_name = music.find('div', class_='pl2').find('a').get_text().strip()
        music_info = music.find('div', class_='pl2').find('p').get_text().strip().split('/')
        music_artist = music_info[0].strip()
        music_album = music_info[1].strip()
        music_year = music_info[2].strip() if len(music_info) == 4 else ''
        music_rating = music.find('span', class_='rating_nums').get_text().strip()
        music_votes = music.find('span', class_='pl').get_text().strip()
        print("排名:{}\n歌曲名:{}\n歌手:{}\n专辑名:{}\n发行年份:{}\n评分:{}\n评论数:{}\n".format(rank, music_name, music_artist, music_album, music_year, music_rating, music_votes))

for i in range(0, 250, 25):
    get_music_info(url + '?start=' + str(i))
```

This program uses the requests and BeautifulSoup libraries to crawl the Douban Music Top 250. It first defines the request URL and headers, fetches each page's HTML with requests, and parses it with BeautifulSoup. The `get_music_info` function extracts the details of every track on a page and prints them. The main loop steps the URL's `start` parameter from 0 to 225 in increments of 25, calling `get_music_info` once per page.