第二节练习项目:爬取商品信息
from bs4 import BeautifulSoup
import re # 引入正则表达式
# Path to the locally saved static web page.
path = './index.html'

# Read raw bytes rather than text: BeautifulSoup detects the document's
# encoding itself, so parsing no longer depends on the platform's default
# text encoding.
with open(path, 'rb') as wb_file:
    wb_content = wb_file.read()
soup = BeautifulSoup(wb_content, 'lxml')

# All five selectors share this prefix and differ only in the trailing part.
item_selector = 'body > div > div > div.col-md-9 > div > div > div'
pics = soup.select(item_selector + ' > img')
titles = soup.select(item_selector + ' > div.caption > h4 > a')
prices = soup.select(item_selector + ' > div.caption > h4.pull-right')
stars = soup.select(item_selector + ' > div.ratings > p:nth-of-type(2)')
reviews = soup.select(item_selector + ' > div.ratings > p.pull-right')

# Captures the leading run of digits in the review text; compiled once
# instead of on every loop iteration.
review_pattern = re.compile(r'(\d+)\s*.*')

# NOTE: zip() stops at the shortest list, so if any selector matches fewer
# elements than the others, the trailing items are silently dropped.
for pic, title, price, star, review in zip(pics, titles, prices, stars, reviews):
    count_match = review_pattern.search(review.get_text())
    data = {
        'pic': pic.get('src'),
        'title': title.get_text(),
        'price': price.get_text(),
        # Star rating = number of <span> elements with class "glyphicon-star".
        'star': len(star.find_all('span', 'glyphicon-star')),
        # Review count parsed from the text; None when it contains no digits
        # (previously this raised AttributeError on the failed match).
        'review': int(count_match.group(1)) if count_match else None,
    }
    print(data)
屏幕快照 2016-05-28 下午4.25.32.png
学习了如何用 len() 获取列表(如 find_all 的返回结果)的长度
了解了基础的正则表达式知识
第三节练习项目:爬取租房信息
from bs4 import BeautifulSoup
import requests
import time
import re
def get_room_info(soup,data=None):
titles = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
addresses = soup.select('body > div.wrap.clearfix.con_bg > d