import requests
from bs4 import BeautifulSoup
import bs4
import re
def getHTMLText(url, timeout=30):
    """Download *url* and return the response body as text.

    The function is deliberately best-effort: a network failure, a timeout
    or an HTTP error status yields an empty string instead of an exception,
    so the caller can hand the result straight to the parser.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        The decoded page text, or "" when the request fails.
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so they take the failure path.
        r.raise_for_status()
        # Decode with the encoding detected from the body instead of the one
        # declared in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are swallowed; programming errors and
        # KeyboardInterrupt still propagate.
        return ""
def fillUnivList(ulist, rlist, html):
    """Extract titles and ratings from *html* and append them to the lists.

    Every ``<div class="pl2">`` element is treated as one entry. The first
    whitespace-separated token of its first link's text is appended to
    *ulist*, and the text of its ``<span class="rating_nums">`` is appended
    to *rlist* ("无评价" when the span is missing). Each extracted pair is
    also printed as ``title : rating``.

    Args:
        ulist: List that receives the titles; mutated in place.
        rlist: List that receives the ratings; mutated in place. One rating
            is appended for every title appended to *ulist*.
        html: Markup of one result page; an empty string adds nothing.
    """
    soup = BeautifulSoup(html, "html.parser")
    for tg in soup.find_all("div", attrs={"class": "pl2"}):
        link = tg.find("a")
        # Skip entries without a usable title instead of crashing on them;
        # skipping both appends keeps the two lists aligned.
        words = link.text.split() if link is not None else []
        if not words:
            continue
        title = words[0]

        # A single lookup is enough: find() returns None when the span is absent.
        rate = tg.find("span", attrs={"class": "rating_nums"})
        rating = rate.text if rate is not None else "无评价"

        ulist.append(title)
        rlist.append(rating)
        # Print the values just added rather than indexing from 0, which
        # showed stale entries whenever the lists were not empty on entry.
        print("{} : {}".format(title, rating))
def main():
    """Crawl the paginated movie.douban.com tag listing and print each entry.

    Requests the listing at ``start`` offsets 0, 20, ..., 980 (50 requests in
    total). Each page is downloaded with ``getHTMLText`` and handed to
    ``fillUnivList``, which prints the title/rating pairs it finds. The
    per-page lists are discarded after each page; nothing is returned.
    """
    page_size = 20
    last_offset = 980
    # The tag segment is the percent-encoded UTF-8 form of "韩国".
    url_template = "https://movie.douban.com/tag/%E9%9F%A9%E5%9B%BD?start={}&type=S"

    # range() yields the same offsets the manual accumulation loop used to build.
    for n in range(0, last_offset + 1, page_size):
        uinfo = []
        rinfo = []
        html = getHTMLText(url_template.format(n))
        fillUnivList(uinfo, rinfo, html)
# Start the crawl. The call sits at module level, so it also runs when this
# file is imported, not only when it is executed as a script.
main()
re.compile 会把正则表达式编译成模式对象,用来匹配一组字符串;它在 find_all 中有以下几种用法:
1、for tg in soup.find_all(re.compile("name"))
默认按标签的名称进行正则匹配,没有匹配的标签则返回 []
2、for tg in soup.find_all(id=re.compile("???"))
按标签的 id 属性进行正则匹配(??? 代表待填写的正则表达式),匹配则返回含该属性的标签
3、for tg in soup.find_all("div", "pl2")
查找名称为 "div" 且 class 属性值含 "pl2" 的标签,返回这些标签
4、for tg in soup.find_all(string=">>>")
遍历标签内的 string 文本,返回与 ">>>" 完全相同的文本(返回的是文本而不是标签)
5、若直接使用 soup.find_all(re.compile("abc"))
标签名称(name)中含 abc 的标签均会返回
其他用法同理