[Python beginner] A question about exception handling while writing a crawler in Python

The code is as follows:

# -*- coding:utf-8 -*-
from urllib.request import urlopen
import bs4
import webbrowser
import requests

html_list = []      # all pages to crawl
html_list_txt = []  # parsed source of every page to crawl
movie_list = []     # all movies
#movie_total = {}   # dict holding all the movie data

#def add_movie():
html = 'https://movie.douban.com/top250'
html_list.append(html)
#html2 = requests.get('https://movie.douban.com/top250')
#webbrowser.open(html)

html_txt = urlopen(html).read()
#html_txt = requests.get(html).text
bsObj = bs4.BeautifulSoup(html_txt, 'html.parser')
print('---1---')
html_div = bsObj.find('div', {'class': 'paginator'})
print('---2---')
html_a = html_div.findAll('a')
print('---3---')

for html_a_temp in html_a:
    #print(type(html_a_temp))
    #i = 2
    #while i <= 9:
    #print(html_a_temp.get_text())
    #if html_a_temp.get_text != '后页>':
    html_href = html_a_temp.attrs['href']
    html_href = 'https://movie.douban.com/top250' + html_href
    html_list.append(html_href)
    #i += 1

print('---4---')
html_list = list(set(html_list))  # deduplicate the links
print(len(html_list))
#print(html_set)
#print(len(html_set))
#print(set(html_list))
#print(type('后页'))
# the code above collects all the page links

for html_list_temp in html_list:
    # fetch every link and store its parsed source
    html_read = bs4.BeautifulSoup(urlopen(html_list_temp).read(), 'html.parser')
    html_list_txt.append(html_read)

for html_page in html_list_txt:
    name_div_list = html_page.findAll('div', {'class': 'info'})
    for name_div_temp in name_div_list:
        movie_total = {}  # local dict for the movie currently being parsed
        name_div_inside = name_div_temp.findAll('div')
        movie_name = name_div_inside[0].a.span.get_text()  # movie title
        name_div_star = name_div_temp.find('div', {'class': 'star'})
        name_div_star_span = name_div_star.findAll('span')
        movie_score = name_div_star_span[1].get_text()   # rating
        movie_number = name_div_star_span[3].get_text()  # number of ratings
        # ------ separator ------
        try:
            movie_introduction = name_div_temp.find('span', {'class': 'inq'}).get_text()  # one-line blurb
        except AttributeError:
            print("This movie has no blurb~~~~~~~~~~~")
        print(movie_introduction)
        #name_span_inq = name_div_temp.findAll('p')[1].span.get_text()
        #movie_introduction = name_span_inq
        #name_span_inq = name_div_temp.find('span', {'class': 'inq'})
        #movie_introduction = name_span_inq.get_text()  # one-line blurb
        movie_total['name'] = movie_name
        movie_total['score'] = movie_score
        movie_total['number'] = movie_number
        movie_total['introduction'] = movie_introduction
        movie_list.append(movie_total)

print(movie_list)

'''
name_div_inside_span_list = name_div_inside[1].div.findAll('span')
for name_div_inside_span_temp in name_div_inside_span_list:
    movie_score = name_div_inside[1].div.span[1].get_text()   # rating
    movie_number = name_div_inside[1].div.span[3].get_text()  # number of ratings
'''
#movie_total[name] = movie_name
'''
name_div_list = html_page.findAll('div', {'class': 'hd'})
for name_div_temp in name_div_list:
    movie_name = name_div_temp.a.span.get_text()
    movie_name_list.append(movie_name)
'''
#print(movie_name_list)

The output looks like this:

[screenshot of the program output]

At the blue arrow, the exception handler was triggered, but right after that the previous movie's blurb was printed again. Why does this happen?
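For context, here is a minimal sketch of the mechanism using hypothetical data rather than the Douban page: when find('span', {'class': 'inq'}) returns None because a movie has no blurb, calling .get_text() on None raises AttributeError before movie_introduction is reassigned, so the variable still holds the value from the previous loop iteration, and the print(movie_introduction) after the try/except shows that old blurb again (it is also stored into the current movie's dict). The names blurbs and blurb below are made up for illustration:

# Minimal sketch of the same behaviour with hypothetical data (not the Douban page):
blurbs = ['A great film.', None, 'Another classic.']  # None stands in for a movie with no blurb
for blurb in blurbs:
    try:
        movie_introduction = blurb.upper()  # raises AttributeError when blurb is None
    except AttributeError:
        print("This movie has no blurb~~~~~~~~~~~")
    print(movie_introduction)  # after the exception, this still holds the previous iteration's value

A common way to avoid carrying the stale value forward is to assign a default, for example movie_introduction = '' inside the except branch (or before the try), so each iteration starts from a known value.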

