import requests
from bs4 import BeautifulSoup

def getHTML(url):  # fetch the page
    # Maoyan blocks the default requests User-Agent, so send a browser-style
    # UA string to make the request look like it comes from a real browser
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}
    html = requests.get(url, headers=headers)
    html.encoding = "UTF-8"
    return html.text
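
The User-Agent above is the minimal anti-scraping workaround. A slightly more defensive variant of the same request, sketched below, adds a timeout and a status check; the Referer header and the 10-second timeout are assumptions added here, not anything Maoyan documents:

def get_html_safe(url):  # hypothetical hardened variant of getHTML
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
               'Referer': 'http://maoyan.com/'}  # assumed to help; not verified
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()  # fail loudly on 403/5xx instead of parsing an error page
    resp.encoding = "UTF-8"
    return resp.text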
def fenxi(html, g):  # parse the page
    name = []
    star = []
    time = []
    soup = BeautifulSoup(html, "html.parser")
    soup = soup.find_all("dd")  # each film sits in its own dd tag
    for i in soup:
        # find_all('p', attrs={'class': "name"}) searches for the matching p
        # tags; [0].string extracts the text of the first match directly
        c = i.find_all('p', attrs={'class': "name"})[0].string
        # .split() strips the surrounding whitespace; s[0] is the cast line
        s = i.find_all('p', attrs={'class': "star"})[0].string.split()
        t = i.find_all('p', attrs={'class': "releasetime"})[0].string
        name.append(c)
        star.append(s[0])
        time.append(t)
    # open in append ('a') text mode with utf-8, so str values can be
    # written directly
    with open(r"F:\t.txt", "a", encoding='utf-8') as f:
        for i in range(len(name)):
            f.write(str(g) + str(i) + " " + str(name[i]) + " " + str(time[i]) + " " + str(star[i]) + "\n")
    print(name, star, time)
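
Worth noting: find_all returns a ResultSet (in effect a list of Tag objects), so the [0] indexing above raises IndexError whenever a dd lacks one of the three p tags, and .string returns None when a tag holds more than one child. A small helper along these lines makes the extraction safer; the empty-string fallback is a choice made here, not part of the original:

def first_text(tag, name, cls):  # hypothetical helper, not in the original script
    hits = tag.find_all(name, attrs={'class': cls})  # ResultSet: a list of Tags
    return hits[0].get_text(strip=True) if hits else ""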
def main():
    for i in range(0, 10):  # ten pages, ten films per page
        url = "http://maoyan.com/board/4?offset=" + str(i * 10)
        html = getHTML(url)
        fenxi(html, i)

main()
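
Since the board shows ten films per page, offset has to step through 0, 10, ..., 90 to cover the whole list; a quick way to sanity-check the URLs main() will fetch:

for off in range(0, 100, 10):  # prints the ten page URLs
    print("http://maoyan.com/board/4?offset=" + str(off))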
The key parts of a crawler like this are understanding what find_all returns, regular expressions, and dealing with anti-scraping measures; the rest is mostly boilerplate.
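
For the regular-expression route mentioned above, the same three fields can be pulled straight out of the raw HTML with re instead of BeautifulSoup. The pattern below is only a sketch: it assumes the markup implied by the parser above (one dd per film, p tags of class name, star and releasetime, the title inside an a tag) and will break if Maoyan changes the page:

import re

pattern = re.compile(
    r'<dd>.*?class="name"><a[^>]*>(.*?)</a>'   # film title text
    r'.*?class="star">\s*(.*?)\s*</p>'         # cast line
    r'.*?class="releasetime">(.*?)</p>',       # release time line
    re.S)  # re.S lets .*? match across newlines

def fenxi_re(html):  # hypothetical regex-based counterpart of fenxi
    return pattern.findall(html)  # list of (name, star, releasetime) tuples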