实例1 中国大学排名定向爬虫
需求分析:
1 爬取url:http://www.zuihaodaxue.cn/shengyuanzhiliangpaiming2017.html
2 获取大学的排名信息,‘排名’,‘学校名称’,‘总分’
概要设计:
1 获取网页内容 getHTMLText()
2 解析网页内容并保存信息 fillUnivList()
3 输出信息 printUnivList()
详细设计:
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    """Fetch *url* and return the response body as text, or "" on failure.

    Sets encoding from apparent_encoding so CJK pages decode correctly.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        r.encoding = r.apparent_encoding
        return r.text
    except requests.exceptions.RequestException:
        # Narrowed from a bare `except:` — only network/HTTP failures are
        # expected here; anything else should surface as a bug.
        return ""
def fillUnivList(ulist, html):
    """Parse the ranking page *html* and append [rank, name, score] rows to *ulist*.

    Silently does nothing when the expected <tbody> is absent (e.g. the
    fetch returned "" or the page layout changed).
    """
    soup = BeautifulSoup(html, "html.parser")
    body = soup.find('tbody')
    if body is None:
        # Original crashed with AttributeError on a failed fetch.
        return
    for tr in body.children:
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')
            # Guard malformed rows: original indexed tds[3] unconditionally.
            if len(tds) >= 4:
                ulist.append([tds[0].string, tds[1].string, tds[3].string])
def printUnivList(ulist, num):
    """Print the first *num* rows of *ulist* as an aligned three-column table."""
    # chr(12288) is the full-width (CJK) space; using it as the fill
    # character keeps Chinese school names aligned in terminal columns.
    row = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    print(row.format("排名", "学校名称", "总分", chr(12288)))
    for idx in range(num):
        rank, name, score = ulist[idx]
        print(row.format(rank, name, score, chr(12288)))
    print("Suc" + str(num))
if __name__ == "__main__":
    # Crawl the 2017 ranking page and print the top 20 universities.
    rankings = []
    target = "http://www.zuihaodaxue.cn/shengyuanzhiliangpaiming2017.html"
    page = getHTMLText(target)
    fillUnivList(rankings, page)
    printUnivList(rankings, 20)
输出结果
实例2 淘宝商品信息定向爬虫
需求分析:
1 获取淘宝搜索页面的信息,提取其中的商品名称和价格
2 翻页处理
概要设计:
1 提交商品搜索请求,循环获取页面
2 对于每个页面,提取商品名称和价格信息
3 将信息输出
详细设计:
import requests
import re
def getHTMLText(url):
    """Fetch *url* and return the response body as text, or "" on failure.

    Sets encoding from apparent_encoding so CJK pages decode correctly.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        r.encoding = r.apparent_encoding
        return r.text
    except requests.exceptions.RequestException:
        # Narrowed from a bare `except:` — only network/HTTP failures are
        # expected here; anything else should surface as a bug.
        return ""
def parserPage(ilt, html):
    """Extract (price, title) pairs from a Taobao search page and append
    them to *ilt* as [price, title] lists of strings.

    Prints an empty line (original best-effort behavior) if parsing fails.
    """
    try:
        plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)  # 获取商品价格
        tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)       # 获取商品名称
        for i in range(len(plt)):
            # The original used eval() on scraped input, which is unsafe
            # (arbitrary code execution) and crashed on titles containing
            # ':'.  Split once on the first colon and strip the quotes.
            price = plt[i].split(':', 1)[1].strip('"')
            title = tlt[i].split(':', 1)[1].strip('"')
            ilt.append([price, title])
    except Exception:
        # Keep the deliberate best-effort contract, but don't swallow
        # KeyboardInterrupt/SystemExit like the original bare except did.
        print("")
def printGoodsList(ilt):
    """Print *ilt* as a numbered table: index, price, product title."""
    header = "{:4}\t{:8}\t{:16}"
    print(header.format("序号", "价格", "商品名称"))
    # enumerate from 1 replaces the manual counter of the original.
    for serial, (price, title) in enumerate(ilt, start=1):
        print(header.format(serial, price, title))
def main():
    """Crawl *depth* Taobao search result pages for *goods* and print
    every (price, title) pair found."""
    goods = '书包'
    depth = 2  # number of result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            # Taobao paginates with the 's' offset parameter: 44 items/page.
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url)
            parserPage(infoList, html)
        except Exception:
            # Best-effort crawl: skip pages that fail (narrowed from bare except).
            continue
    printGoodsList(infoList)


# Guarded entry point, consistent with example 1 — importing this module
# no longer triggers a crawl (the original called main() unconditionally).
if __name__ == "__main__":
    main()