import requests #网页抓取
from bs4 import BeautifulSoup #内容解析
import re #正则表达式处理
#https://www.douban.com/robots.txt
# Page of short comments for one Douban book.
# Alternative (a specific page of the "hot" comments):
#   https://book.douban.com/subject/1986590/comments/hot?p=4
COMMENTS_URL = 'https://book.douban.com/subject/1986590/comments/'

# Seconds to wait for the server before giving up; without a timeout
# requests.get() may block indefinitely.
REQUEST_TIMEOUT = 10

# Matches the rating badge next to a comment, e.g.
#   <span class="user-stars allstar50 rating" title="..."></span>
# and captures the digits that follow "allstar". The capture is restricted
# to digits so it cannot run on to a later ' rating"' on the same line.
STAR_PATTERN = re.compile(r'<span class="user-stars allstar(\d+) rating"')


def extract_comments(html: str) -> list:
    """Return the text of every ``<p class="comment-content">`` in *html*.

    Args:
        html: Raw HTML of a comments page.

    Returns:
        A list of strings, one per matching paragraph, in document order.
        ``get_text`` is used instead of ``.string`` because ``.string`` is
        ``None`` whenever the paragraph has more than one child node.
    """
    soup = BeautifulSoup(html, "lxml")
    paragraphs = soup.find_all('p', class_='comment-content')
    return [paragraph.get_text(strip=True) for paragraph in paragraphs]


def sum_star_ratings(html: str) -> int:
    """Return the sum of all ``allstarNN`` values found in *html*.

    Args:
        html: Raw HTML of a comments page.

    Returns:
        The sum of the integer suffixes of every rating badge matched by
        ``STAR_PATTERN``; ``0`` when no badge is present.
    """
    return sum(int(stars) for stars in STAR_PATTERN.findall(html))


def main() -> None:
    """Fetch the comments page, print its status, comments and star total."""
    response = requests.get(COMMENTS_URL, timeout=REQUEST_TIMEOUT)
    print(response.status_code)

    for comment in extract_comments(response.text):
        print(comment)

    print(sum_star_ratings(response.text))


if __name__ == "__main__":
    main()
# Python web data scraping
# (Blog page footer: latest recommended article published 2024-08-21 16:53:21)