# 爬取豆瓣所有电影 (Crawl summary info for all Douban movies)

#!/usr/bin/env python
# encoding: utf-8


'''
   一次性爬取豆瓣所有电影的概要信息
'''


import urllib2
import urllib
import json
import time


# Timestamp format used for progress logging (%X = locale-specific time).
ISOTIMEFORMAT='%Y-%m-%d %X'


# Output destination: one semicolon-separated record per movie.
# NOTE: 'w' truncates any previous run's output.
outputFile = 'douban_movie.txt'
fw = open(outputFile, 'w')
# Header row matching the record layout written in the crawl loop below.
fw.write('id;title;url;cover;rate\n')


# Browser-like request headers for movie.douban.com.
# NOTE(review): this dict is never attached to the urllib2.Request calls
# below -- presumably it was meant to be passed as Request(url, headers=headers);
# confirm intent before relying on it (Accept-Encoding: gzip would then also
# require decompressing the response before json.loads).
headers = {}
headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
headers["Accept-Encoding"] = "gzip, deflate, sdch"
headers["Accept-Language"] = "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4,ja;q=0.2"
headers["Connection"] = "keep-alive"
headers["Host"] = "movie.douban.com"
headers["Referer"] = "http://movie.douban.com/"
# Header values must be strings; the original assigned the int 1, which
# urllib2 rejects when the header is actually put on a request.
headers["Upgrade-Insecure-Requests"] = "1"
headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"


# Fetch the list of movie tags from Douban's JSON endpoint; each tag is
# crawled independently in the loop below.
tagRequest = urllib2.Request(url="http://movie.douban.com/j/search_tags?type=movie")
tagResponse = urllib2.urlopen(tagRequest)
tags = json.loads(tagResponse.read())['tags']


# 开始爬取
print "********** START **********"
print time.strftime( ISOTIMEFORMAT, time.localtime() )


for tag in tags:
   print "Crawl movies with tag: " + tag
   print time.strftime( ISOTIMEFORMAT, time.localtime() )


   start = 0
   while True:
      url = "http://movie.douban.com/j/search_subjects?type=movie&tag=" + tag.encode('utf8') + "&page_limit=20&page_start=" + str(start)
      request = urllib2.Request(url=url)
      response = urllib2.urlopen(request)
      movies = json.loads(response.read())['subjects']
      if len(movies) == 0:
         break
      for item in movies:
         rate = item['rate']
         title = item['title']
         url = item['url']
         cover = item['cover']
         movieId = item['id']
         record = str(movieId) + ';' + title + ';' + url + ';' + cover + ';' + str(rate) + '\n'
         fw.write(record.encode('utf8'))
         print tag + '\t' + title
      start = start + 20


# Flush buffered records and release the output file handle.
fw.close()
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
爬取豆瓣电影数据的一般流程为: 1. 发送 HTTP 请求获取网页内容 2. 解析网页内容,提取需要的数据 3. 存储数据 具体实现方法如下: 1. 安装 requests 和 BeautifulSoup 库 ``` pip install requests pip install beautifulsoup4 ``` 2. 发送 HTTP 请求并获取网页内容 ```python import requests url = 'https://movie.douban.com/top250' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'} response = requests.get(url, headers=headers) html_content = response.text ``` 3. 解析网页内容并提取需要的数据 ```python from bs4 import BeautifulSoup soup = BeautifulSoup(html_content, 'html.parser') items = soup.find_all('div', class_='item') movie_list = [] for item in items: # 提取电影名称和评分等信息 title = item.find('span', class_='title').text rating_num = item.find('span', class_='rating_num').text movie_list.append({'title': title, 'rating_num': rating_num}) ``` 4. 存储数据 ```python import pandas as pd df = pd.DataFrame(movie_list) df.to_csv('douban_movie_top250.csv', index=False) ``` 完整代码如下: ```python import requests from bs4 import BeautifulSoup import pandas as pd url = 'https://movie.douban.com/top250' headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'} response = requests.get(url, headers=headers) html_content = response.text soup = BeautifulSoup(html_content, 'html.parser') items = soup.find_all('div', class_='item') movie_list = [] for item in items: # 提取电影名称和评分等信息 title = item.find('span', class_='title').text rating_num = item.find('span', class_='rating_num').text movie_list.append({'title': title, 'rating_num': rating_num}) df = pd.DataFrame(movie_list) df.to_csv('douban_movie_top250.csv', index=False) ```
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值