How to crawl paginated data with BeautifulSoup

This time we crawl the 讲历史 (jianglishi.cn) history site.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: Hurrican
@file: 分页爬取数据.py
@time: 2018/11/03 9:30
"""
from bs4 import BeautifulSoup
import requests

def get_urls():
    # Page 1 uses a different URL from pages 2-20, so build the list and insert it at the front
    urls = ['http://www.jianglishi.cn/jiemi/page_{}.html'.format(str(i)) for i in range(2, 21)]
    urls.insert(0, 'http://www.jianglishi.cn/jiemi/')
    return urls


def get_title():
    for a1 in get_urls():
        web_data = requests.get(a1)
        web_data.encoding = 'utf-8'
        web = web_data.text
        soup = BeautifulSoup(web, 'html5lib')
        # Each article title sits inside a <div class="title">
        soup1 = soup.findAll(name='div', attrs={'class': 'title'})
        for piece in soup1:
            title = piece.a.string
            print(title)


if __name__ == '__main__':
    get_title()
Output: the article titles from each page are printed, one per line.

Method 2:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: lenovo
@file: spider_urllib.py
@time: 2018/11/07 14:31
"""
import urllib.request
from bs4 import BeautifulSoup
'''
In Python 3, str and bytes are converted with the built-in methods encode() and decode():
str -> bytes uses encode(), and bytes -> str uses decode() (see the short sketch after this listing).
'''
def get_content():
    urls = ['http://www.jianglishi.cn/jiemi/page_{}.html'.format(str(i)) for i in range(2, 21)]
    urls.insert(0, 'http://www.jianglishi.cn/jiemi/')
    for url in urls:
        html = urllib.request.urlopen(url)
        content = html.read()        # bytes
        content = content.decode()   # decode to str (utf-8 by default)
        html.close()

        osoup = BeautifulSoup(content, 'html5lib')
        all_title = osoup.find_all('div', class_="title")
        for title in all_title:
            print(title.a.string)


get_content()
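
As a quick illustration of the str/bytes conversion mentioned in the docstring above (a minimal standalone sketch, not part of the spider):

# str -> bytes with encode(), bytes -> str with decode()
s = '讲历史'                     # str (unicode text)
b = s.encode('utf-8')           # bytes, e.g. b'\xe8\xae\xb2...'
assert b.decode('utf-8') == s   # decoding round-trips back to the original str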

Further reading:

Let's continue by crawling the images as well.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: lenovo
@file: spider_urllib.py
@time: 2018/11/07 14:31
"""
import urllib.request
from bs4 import BeautifulSoup

'''
In Python 3, str -> bytes uses encode() and bytes -> str uses decode(), as noted above.
'''
def get_urls():
    urls = ['http://www.jianglishi.cn/jiemi/page_{}.html'.format(str(i)) for i in range(2, 21)]
    urls.insert(0, 'http://www.jianglishi.cn/jiemi/')
    return urls


def get_content():
    x = 1
    for url in get_urls():
        html = urllib.request.urlopen(url)
        content = html.read()
        content = content.decode('utf-8')
        html.close()

        osoup = BeautifulSoup(content, 'html5lib')
        # The article thumbnails all carry the same onerror attribute, so match on it
        all_images = osoup.find_all('img', onerror="this.src='/statics/statics/img/nopic.gif';this.onerror=null;")
        print(all_images)

        for img in all_images:
            dow_img = img['src'].encode('utf-8').decode('utf-8')
            # One link contains a Chinese full-width period; normalize it into a standard URL
            g = dow_img.replace('。', '.')
            urllib.request.urlretrieve(g, r'H:\py\image\%s.jpg' % x)
            print("Downloading %s" % dow_img)
            x += 1
    print("Download finished")


get_content()
'''
<img src="http://cimg2。163.com/cnews/2006/9/25/20060925163612ab80e.jpg" alt="三峡蓄水奉节老城全淹
“刘备疑冢”永沉长江底" onerror="this.src='/statics/statics/img/nopic.gif';this.onerror=null;">
'''
The situation above can also be avoided simply by wrapping the download in a try/except, as sketched below.
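
A minimal sketch of that approach, assuming the same all_images result, x counter, and download directory as in the listing above; any URL that still fails to download is skipped instead of aborting the whole run:

for img in all_images:
    dow_img = img['src']
    try:
        # Attempt the download as-is; a malformed URL (e.g. the full-width period) raises an exception
        urllib.request.urlretrieve(dow_img, r'H:\py\image\%s.jpg' % x)
        print("Downloading %s" % dow_img)
        x += 1
    except Exception as e:
        # Skip broken links rather than crashing the whole crawl
        print("Skipping %s: %s" % (dow_img, e))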

Reposted from: https://www.cnblogs.com/wujf-myblog/p/9906858.html
