import csv
import random
import time
import urllib.request

# ---------------------------------------------------------------------------
# Demo 1: basic CSV handling with the standard-library `csv` module.
# ---------------------------------------------------------------------------
print("表格操作")
# The snippet below is kept (disabled) as a reference for the csv API.
# Mode "a+" appends to the file; newline="" prevents blank rows from being
# inserted between records.
#
# Write:
# with open("tv.csv", "w", encoding="utf-8", newline="") as openFile:
#     data = ["剧名", "男主", "女主"]
#     csv.writer(openFile).writerow(data)
#     datas = [["回家的诱惑", "红实现", "品如"],
#              ["再见难逃", "鹿晗", "乔欣"],
#              ["澄清了", "哈哈哈", "啊啊啊"]]
#     csv.writer(openFile).writerows(datas)
#     print("数据装入成功")
#
# Read:
# with open("tv.csv", "r", encoding="utf-8") as tvFile:
#     for row in csv.reader(tvFile):
#         print(row)

# ---------------------------------------------------------------------------
# Demo 2: scrape short comments for one Douban movie into douban.csv.
# ---------------------------------------------------------------------------
print("豆瓣电影数据爬取")

# Comment-list URL; "%s" is replaced by the offset of the first comment.
COMMENTS_URL = "https://movie.douban.com/subject/26357307/comments?start=%s&limit=20&sort=new_score&status=P"
PAGE_SIZE = 20          # offset step between pages (the URL requests limit=20)
PAGE_COUNT = 5          # number of pages fetched by main()
PAGE_DELAY_SECONDS = 5  # pause after each page to keep the request rate low


def getRequest(url):
    """Build a request for *url* that carries browser-like headers.

    The site is reported (by the original author) to reject non-browser
    clients when several pages are fetched, so each request is sent with the
    headers of one of two real browsers, chosen at random.

    Args:
        url: Address to request.

    Returns:
        A ``urllib.request.Request`` with ``Host`` and ``User-Agent`` set.
    """
    chrome_headers = {
        "Host": "movie.douban.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
    }
    firefox_headers = {
        "Host": "movie.douban.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0",
    }
    headers = random.choice([chrome_headers, firefox_headers])
    return urllib.request.Request(url=url, headers=headers)


def getData(url):
    """Download one comment page and extract its comments.

    CSS selector reminder: ``#name`` matches an id, ``.name`` matches a class.

    Args:
        url: Address of a comment-list page.

    Returns:
        A list of dicts with the keys ``"author"``, ``"star"`` and ``"short"``,
        one per comment. The list is empty when the page has no element with
        id ``comments`` (for example when the request was answered with a
        different page).

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # Imported here so the stdlib-only helpers in this module stay usable
    # when the third-party bs4 package is not installed.
    from bs4 import BeautifulSoup

    req = getRequest(url)
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(req) as response:
        html = response.read()

    # Turn the raw bytes into a navigable tag tree.
    soup = BeautifulSoup(html, "html.parser")

    containers = soup.select("#comments")
    if not containers:
        return []

    dataList = []
    # Every ".comment-item" element holds a single comment.
    for item in containers[0].select(".comment-item"):
        shorts = item.select(".short")
        infos = item.select(".comment-info")
        if not shorts or not infos:
            # Not a complete comment entry; skip it instead of crashing.
            continue

        links = infos[0].select("a")
        spans = infos[0].select("span")

        # The user name is the text of the first link in ".comment-info".
        author = links[0].string if links else None
        # The rating is read from the "title" attribute of the second span.
        # NOTE(review): the original author lists the spans as
        # [watched, star rating, time]; confirm against the live markup,
        # in particular for comments that carry no rating.
        star = spans[1].get("title", "") if len(spans) > 1 else ""

        dataList.append({"author": author, "star": star, "short": shorts[0].string})
    return dataList


def dataInCsv(dataList):
    """Append the scraped comments to ``douban.csv`` in the working directory.

    Args:
        dataList: Iterable of dicts with the keys ``"author"``, ``"star"``
            and ``"short"``; one CSV row is written per dict, in that
            column order.
    """
    # newline="" lets the csv module control line endings (no blank rows).
    with open("douban.csv", "a+", encoding="utf-8", newline="") as openFile:
        writer = csv.writer(openFile)
        writer.writerows(
            [entry["author"], entry["star"], entry["short"]] for entry in dataList
        )
    print("数据装入表格完毕")


def main():
    """Fetch the first PAGE_COUNT comment pages and store them in the CSV."""
    for page in range(PAGE_COUNT):
        url = COMMENTS_URL % (page * PAGE_SIZE)
        print("爬取第%s页数据" % (page + 1))
        dataList = getData(url)
        # Rest after every page before continuing.
        time.sleep(PAGE_DELAY_SECONDS)
        dataInCsv(dataList)


# Script entry point.
if __name__ == '__main__':
    main()
# python爬取豆瓣网花木兰短评
# 最新推荐文章于 2024-10-08 12:37:10 发布