BeautifulSoup is the easiest of these tools to work with, but it is noticeably slower than XPath (lxml) or regular-expression matching, so it is not recommended when throughput matters.
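For a sense of the difference, the same table-cell extraction can be written against lxml's XPath API. A minimal sketch, assuming resHtml already holds the fetched page HTML from case 1 below:

from lxml import etree

# Minimal sketch: pull the first cell (the draw number) of every data row
# with XPath instead of BeautifulSoup. Assumes resHtml holds the page HTML.
tree = etree.HTML(resHtml)
for row in tree.xpath("//tr[position() > 2]"):
    cells = row.xpath("./td//text()")
    if cells:
        print(cells[0])  # draw number, same field as cells[0] below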
Case 1: a scraper for historical 双色球 (two-color ball) lottery results, extracting the draw number, draw date, red balls, blue ball, and the first- and second-prize figures.
from urllib import request
from urllib import error
from bs4 import BeautifulSoup

def shuangseqiu(url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    req = request.Request(url, headers=headers)
    try:
        response = request.urlopen(req)
        resHtml = response.read().decode("utf-8", 'ignore')
        soup = BeautifulSoup(resHtml, 'lxml')
        # The first two <tr> rows are table headers; data rows follow.
        for row in soup.select('tr')[2:]:
            cells = row.select('td')
            qihao = cells[0].get_text()       # draw number
            date = cells[1].get_text()        # draw date
            # The six red balls sit in columns 2-7, the blue ball in column 8.
            red_qiu = ",".join(td.get_text() for td in cells[2:8])
            blueqiu = cells[8].get_text()
            totalprice = cells[9].get_text()  # total stake (yuan)
            OneNumber = cells[10].get_text()  # first prize: winning bets
            OnePrice = cells[11].get_text()   # first prize: payout (yuan)
            TwoNumber = cells[12].get_text()  # second prize: winning bets
            TwoPrice = cells[13].get_text()   # second prize: payout (yuan)
            jackpot = cells[14].get_text()    # jackpot rollover (yuan)
            print(qihao, date, red_qiu, blueqiu, totalprice,
                  OneNumber, OnePrice, TwoNumber, TwoPrice, jackpot)
    except error.URLError as e:
        print(e)

if __name__ == "__main__":
    # Route all requests through an HTTP proxy (replace with a live one of your own).
    proxy = {"http": "118.31.220.3:8080"}
    proxy_support = request.ProxyHandler(proxy)
    opener = request.build_opener(proxy_support)
    request.install_opener(opener)
    url = "http://zst.aicai.com/ssq/openInfo/"
    shuangseqiu(url)
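The function above only prints each draw. To persist the results, a minimal sketch of appending one draw per row to a CSV file (the file name and column order are my own choices):

import csv

def save_draw(row, path="shuangseqiu.csv"):
    # Append one draw record, e.g.
    # [qihao, date, red_qiu, blueqiu, totalprice, OneNumber, ...].
    with open(path, "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow(row)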
Case 2: scraping the photos from the kongjie.com photo albums.
"""
需求:爬取空姐网相册中照片,保存在存储在images目录下,文件命名:uid + picid +'.jpg'
http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1
"""
from urllib import request, parse
from urllib import error
import re
import string
from bs4 import BeautifulSoup
def save_photos(pageSoup, count):
    """Download every photo linked from one album page; return the updated count."""
    for li in pageSoup.select('ul[class="ptw ml mlp cl"] li'):
        # Each <li> links to a single photo page.
        photo_url = li.a.get('href')
        photoHtml = request.urlopen(photo_url).read().decode("utf-8", 'ignore')
        photoSoup = BeautifulSoup(photoHtml, 'lxml')
        for link in photoSoup.select('div[id="photo_pic"] a'):
            # The full-size image sits inside div#photo_pic.
            img_url = link.img.get('src')
            print(img_url)
            image = request.urlopen(img_url).read()
            count += 1
            # NOTE: the requirement asks for uid + picid + '.jpg' names;
            # this keeps the original counter-based naming instead.
            with open('./images/空姐_%s.jpg' % count, 'wb') as file:
                file.write(image)
    return count

def kongjie(url, beginPage, endPage):
    """
    Purpose: build the URL of each album-list page and request it in turn.
    url: the base list URL to process
    beginPage: first page the crawler visits
    endPage: page at which the crawler stops (exclusive)
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}
    count = 0  # running photo counter, used in the file names
    for page in range(beginPage, endPage):
        # Build the full URL; the page number advances by 1 per pass.
        fullurl = url + "&page=" + str(page)
        req = request.Request(fullurl, headers=headers)
        listHtml = request.urlopen(req).read().decode("utf-8", 'ignore')
        listSoup = BeautifulSoup(listHtml, 'lxml')
        for m in listSoup.select('div[class="c"]'):
            # Each div.c on the list page links to one album.
            album_url = m.a.get('href')
            albumHtml = request.urlopen(album_url).read().decode("utf-8", 'ignore')
            albumSoup = BeautifulSoup(albumHtml, 'lxml')
            # When an album is paginated, a <span title="... N ..."> inside a
            # <label> carries its page count.
            num = 0
            for label in albumSoup.find_all('label'):
                for span in label.select('span'):
                    title = span.get('title')
                    if title and re.search(r"\d+", title):
                        num = int(re.findall(r"\d+", title)[0])
            if num:
                # Paginated album: walk every one of its pages.
                for pn in range(0, num):
                    kjurl = album_url + '&page=' + str(pn)
                    print(kjurl)
                    kjreq = request.Request(kjurl, headers=headers)
                    pageHtml = request.urlopen(kjreq).read().decode("utf-8", 'ignore')
                    count = save_photos(BeautifulSoup(pageHtml, 'lxml'), count)
            else:
                # Single-page album: parse the page we already fetched.
                count = save_photos(albumSoup, count)
# main entry point
if __name__ == "__main__":
    # Route all requests through an HTTP proxy (replace with a live one of your own).
    proxy = {"http": "218.22.102.107:80"}
    proxy_support = request.ProxyHandler(proxy)
    opener = request.build_opener(proxy_support)
    request.install_opener(opener)
    # Read the start and end pages, converting str to int.
    beginPage = int(input("Start page: "))
    endPage = int(input("End page: "))
    url = "http://www.kongjie.com/home.php?mod=space&do=album&view=all"
    # Percent-encode anything outside the printable ASCII range.
    url = parse.quote(url, safe=string.printable)
    kongjie(url, beginPage, endPage)
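Unlike case 1, this scraper never catches network errors, so one failed request aborts the whole run. A minimal sketch of a guarded fetch helper (the name and timeout are my own choices) that either case could route its requests through:

def fetch(url, headers=None, timeout=10):
    # Hypothetical helper: one guarded fetch with a timeout.
    # Returns the raw response bytes, or None if the request fails.
    try:
        req = request.Request(url, headers=headers or {})
        return request.urlopen(req, timeout=timeout).read()
    except error.URLError as e:
        print("fetch failed:", url, e)
        return None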