如下图所示,我们要爬取标题、价格和区域等信息
1、导入库
from bs4 import BeautifulSoup
import requests
import time
2、抓取整个HTML网页(增加判断语句,防止解析404页面)
# Fetch the listing page. The address is kept in `url` so that later steps
# can refer to the page that was scraped.
url = 'https://bj.58.com/jiadian/29063883256526x.shtml'
# A timeout keeps the script from hanging forever if the server never answers.
wb_data = requests.get(url, timeout=10)

# Only parse pages that exist. `soup` stays None for a 404 response so that
# later code can detect the missing page instead of hitting an undefined name.
soup = None
if wb_data.status_code != 404:
    soup = BeautifulSoup(wb_data.text, 'lxml')
3、解析抓取的内容,并定位自己想要的内容
def _first_text(selector):
    """Return the stripped text of the first element matching `selector`.

    Returns None when nothing on the page matches, instead of raising
    IndexError the way ``soup.select(selector)[0]`` would.
    """
    matches = soup.select(selector)
    return matches[0].text.strip() if matches else None


# Build the record only for pages that exist; a 404 response leaves it None,
# so the parsed document is never touched for a missing page.
data = None
if wb_data.status_code != 404:
    data = {
        # A page without a <title> tag yields None rather than AttributeError.
        'title': soup.title.text.strip() if soup.title else None,
        # Full selector path that appears to have been copied from the browser
        # for the price element (TODO confirm against the live page):
        # #basicinfo > div.infocard__container.haveswitch > div > div.infocard__container__item__main > span
        'price': _first_text('span.infocard__container__item__main__text--price'),
        'area': _first_text('div.infocard__container__item__main > a'),
        # Fields that are currently not extracted; selectors kept for reference:
        # 'pub_date': soup.select('.pr-5')[0].text.strip().split(' ')[0],
        # 'cates': list(soup.select('ul.det-infor > li:nth-of-type(1) > span')[0].stripped_strings),
        # Address of the page as reported by the response object.
        'url': wb_data.url,
    }
print(data)