Python 爬虫：豆瓣电影 Top 250

最新推荐文章于 2023-08-21 08:00:00 发布

cpongo11

最新推荐文章于 2023-08-21 08:00:00 发布

阅读量462

点赞数

企鹅我认为v

本文链接：https://blog.csdn.net/cpongo11/article/details/101027478

版权

2015年9月16日 No comments Article

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/strict.dtd">

# encoding=utf-8
import sys
# Python 2 only: make UTF-8 the interpreter-wide default encoding so that
# implicit str/unicode conversions of the Chinese titles do not raise
# UnicodeDecodeError/UnicodeEncodeError. reload(sys) is called first because
# sys.setdefaultencoding is not reachable on a normally started interpreter.
# Python 3 has neither the builtin reload() nor sys.setdefaultencoding and
# already defaults to UTF-8, so the whole hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf8')
import urllib
from bs4 import BeautifulSoup

debug = True # Whether log() prints its messages

def log(message):
    """Print ``message`` when the module-level ``debug`` flag is truthy.

    Args:
        message: The text to print.
    """
    if debug:
        # Single-argument call form prints identically on Python 2 and 3.
        print(message)

def download_image(url, save_path):
    """Download the image at ``url`` and store it at ``save_path``.

    Any failure is reported on stdout and swallowed, so a single bad image
    does not abort the caller's loop.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path.
    """
    # urlretrieve lives in urllib.request on Python 3 and in urllib on
    # Python 2; resolve whichever is available.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    try:
        urlretrieve(url, save_path)
        log('Downloaded a image: ' + save_path)
    except Exception as e:
        # Best effort: report and carry on.
        print('An error catched when download a image: %s' % (e,))

def load_page_html(url):
    """Fetch ``url`` and return the response body as returned by ``read()``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The raw page content (a byte string).
    """
    # urlopen lives in urllib.request on Python 3 and in urllib on Python 2;
    # resolve whichever is available.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    log('Get a html page : ' + url)
    response = urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even if read() raises.
        response.close()

def down_page_images(page, save_dir):
    """Download the poster images listed on one Top 250 page.

    For every ``<div class="item">`` that contains an ``<img>`` with both
    ``alt`` and ``src`` attributes, the image is saved under ``save_dir`` and
    a line ``<rank>.<alt>`` is appended to ``top250.txt`` in the current
    working directory.

    Args:
        page: Value of the ``start`` query parameter (offset of the first
            entry on the page). Also the base for the rank written to
            ``top250.txt``: the first saved entry gets ``page + 1``.
        save_dir: Prefix prepended to each image file name; it is
            concatenated as-is, so it must end with a path separator.
    """
    import io  # io.open gives an explicit encoding on both Python 2 and 3

    html_context = load_page_html(
        'http://movie.douban.com/top250?start=%d&filter=&type=' % page)
    soup = BeautifulSoup(html_context, "html.parser")

    rank = page  # running counter; kept separate from the ``page`` argument
    for item_div in soup.findAll('div', {'class': 'item'}):
        img_tag = item_div.find('img')
        if img_tag is None:
            continue
        if not (img_tag.has_attr('alt') and img_tag.has_attr('src')):
            continue

        alt = img_tag.attrs['alt']  # image description
        src = img_tag.attrs['src']  # image address
        # The last four characters of the URL are taken as the extension
        # (some images end in '.jpg', others in '.gif').
        filename = '%s%s' % (alt, src[-4:])
        download_image(src, save_dir + filename)

        rank += 1
        # Append one "<rank>.<alt>" line per entry; the newline escape was
        # lost in the published listing ("n" instead of "\n").
        with io.open('top250.txt', 'a', encoding='utf-8') as file_obj:
            file_obj.write(u'%d.%s\n' % (rank, alt))

def download_qbcr(frm=0, save_dir='./'):
    """Download the images of every list page from offset ``frm`` up to 250.

    Args:
        frm: Offset of the first page to fetch; pages are visited in steps
            of 25 while the offset is below 250.
        save_dir: Directory prefix passed through to ``down_page_images``.
    """
    # range() behaves like the original xrange() here and exists on both
    # Python 2 and 3; repr() replaces the Python 2 only backtick syntax.
    for start in range(frm, 250, 25):
        log('Page : ' + repr(start))
        down_page_images(start, save_dir)

def main():
    """Run the full download into the hard-coded output directory."""
    # Backslashes are escaped explicitly: an unescaped trailing backslash
    # would swallow the closing quote, and "\U" is an invalid escape on
    # Python 3. The trailing separator is required because the directory is
    # joined to file names by plain concatenation.
    base_path = 'C:\\Users\\jingle\\Desktop\\pachong\\'
    download_qbcr(frm=0, save_dir=base_path)

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

Categories: Python, 爬虫

发表评论取消回复

电子邮件地址不会被公开。必填项已用*标注

姓名 *

电子邮件 *

站点

您可以使用这些HTML标签和属性： <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>