# find-all
from bs4 import BeautifulSoup
# Parse the saved article-list page, collect one record per article in
# ``data`` and print the title and rating of every article rated 4 or higher.
data = []
# NOTE(review): ':' is a reserved character in Windows file names, so the '::'
# in this path looks suspicious -- confirm the actual directory name.
path = r'D:\BaiduYunDownload\Python实战::四周实现爬虫系统\课程资料\课程源码及作业参考答案\week1\1_2\1_2code_of_video\web\new_index.html'

# Read the raw bytes so BeautifulSoup detects the document's encoding itself
# instead of relying on the platform's default text encoding.
with open(path, 'rb') as f:
    Soup = BeautifulSoup(f.read(), 'lxml')

titles = Soup.select('body > div.main-content > ul > li > div.article-info > h3 > a')
images = Soup.select('body > div.main-content > ul > li > img')
descs = Soup.select('body > div.main-content > ul > li > div.article-info > p.description')
cates = Soup.select('body > div.main-content > ul > li > div.article-info > p.meta-info')
rates = Soup.select('body > div.main-content > ul > li > div.rate > span')

# zip() stops at the shortest list, so the five selections are paired
# positionally and any surplus elements are ignored.
for title, image, desc, cate, rate in zip(titles, images, descs, cates, rates):
    info = {
        'title': title.get_text(),
        'image': image.get('src'),
        'desc': desc.get_text(),
        # A meta-info paragraph may hold several text fragments; keep them all.
        'cate': list(cate.stripped_strings),
        'rate': rate.get_text(),
    }
    data.append(info)

# The rating is stored as text, so convert it before comparing.
for i in data:
    if float(i['rate']) >= 4:
        print(i['title'], i['rate'])
from bs4 import BeautifulSoup
# Parse the homework product page and print one record per product, with the
# rating expressed as the number of filled star icons.
path = './1_2_homework_required/index.html'

# Hand BeautifulSoup a binary stream so it detects the document's encoding
# itself instead of relying on the platform's default text encoding.
with open(path, 'rb') as wb_data:
    Soup = BeautifulSoup(wb_data, 'lxml')

titles = Soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4 > a')
images = Soup.select('body > div > div > div.col-md-9 > div > div > div > img')
reviews = Soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p.pull-right')
prices = Soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4.pull-right')
stars = Soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p:nth-of-type(2)')

# zip() pairs the selections positionally and stops at the shortest list.
for title, image, review, price, star in zip(titles, images, reviews, prices, stars):
    data = {
        'title': title.get_text(),
        'image': image.get('src'),
        'review': review.get_text(),
        'price': price.get_text(),
        # Match the single class token "glyphicon-star" rather than the exact
        # attribute string, so the count does not depend on class order or
        # spacing; "glyphicon-star-empty" is a different token and is skipped.
        'star': len(star.find_all('span', class_='glyphicon-star')),
    }
    print(data)
from bs4 import BeautifulSoup
import string
# Parse the homework product page and print one record per product, with the
# rating rendered as a string of filled (★) and empty (☆) stars.


def _render_stars(rating):
    """Return the star icons directly inside *rating* as a ★/☆ string."""
    symbols = []
    # Only direct children are inspected, mirroring a "p > span" selector.
    for span in rating.find_all('span', recursive=False):
        classes = span.get('class', [])
        if 'glyphicon-star' in classes:
            symbols.append('★')
        elif 'glyphicon-star-empty' in classes:
            symbols.append('☆')
    return ''.join(symbols)


# Hand BeautifulSoup a binary stream so it detects the document's encoding
# itself instead of relying on the platform's default text encoding.
with open(r'D:\BaiduYunDownload\Python实战::四周实现爬虫系统\课程资料\课程源码及作业参考答案\week1\1_2\1_2answer_of_homework\1_2_homework_required\index.html', 'rb') as web_data:
    soup = BeautifulSoup(web_data, 'lxml')

titles = soup.select(
    'body > div > div > div.col-md-9 > div > div > div > div.caption > h4 > a ')
images = soup.select(
    'body > div > div > div.col-md-9 > div > div > div > img')
reviews = soup.select(
    'body > div > div > div.col-md-9 > div > div > div > div.ratings > p.pull-right')
prices = soup.select(
    'body > div > div > div.col-md-9 > div > div > div > div.caption > h4.pull-right')
# Select one rating paragraph per product instead of a flat list of spans.
# This keeps the ratings in document order and drops the assumption that every
# product has exactly five star icons.
rating_paragraphs = soup.select(
    'body > div > div > div.col-md-9 > div > div > div > div.ratings > p:nth-of-type(2)')
grades = [_render_stars(paragraph) for paragraph in rating_paragraphs]

# zip() pairs the selections positionally and stops at the shortest list.
for title, image, review, price, grade in zip(titles, images, reviews, prices, grades):
    data = {
        'title': title.get_text(),
        'image': image.get('src'),
        'review': review.get_text(),
        'price': price.get_text(),
        'grade': grade,
    }
    print(data)