# Straight to the code: crawl a Baidu Tieba forum by entering the tieba name and the number of pages to fetch.
import requests
import urllib.request
import random
class TiebaWarm:
    """Crawl result pages of a Baidu Tieba forum and save the raw HTML locally."""

    def __init__(self, tiebaName, userAgent, page):
        """
        :param tiebaName: name of the tieba (forum) to crawl
        :param userAgent: headers dict passed to requests (carries the User-Agent)
        :param page: number of result pages to fetch
        """
        self.__tiebaName = tiebaName
        self.__userAgent = userAgent
        # Tieba paginates by post offset via the ``pn`` query parameter.
        self.__url = "https://tieba.baidu.com/f?kw=" + tiebaName + "&ie=utf-8&pn={}"
        self.__page = page

    def getUrlList(self):
        """Build and return the list of page URLs to crawl."""
        # Each result page holds 50 posts, so pn advances in steps of 50.
        self.__urlList = [self.__url.format(p * 50) for p in range(self.__page)]
        return self.__urlList

    def saveHtml(self):
        """Download every page and write it to '<tiebaName>第<n>页.html'."""
        # enumerate replaces the original ``self.__urlList.index(url)`` lookup,
        # which was O(n) per iteration and would mis-number duplicate URLs.
        for pageNo, url in enumerate(self.getUrlList(), start=1):
            data = requests.get(url, headers=self.__userAgent).text
            filePath = "{}第{}页.html".format(self.__tiebaName, pageNo)
            # Persist the raw HTML of this page.
            with open(filePath, "w", encoding="utf-8") as f:
                f.write(data)

    def run(self):
        """Entry point: crawl all configured pages and persist them."""
        self.saveHtml()
if __name__ == '__main__':
    # Pool of User-Agent headers; one is picked at random per run so requests
    # do not all carry the same browser fingerprint.
    userAgentPool = [
        {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36"},
        {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201"},
        {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)"},
    ]
    # Start crawling: 5 pages of the "蔡徐坤" tieba.
    TiebaWarm("蔡徐坤", random.choice(userAgentPool), 5).run()
# Crawler demo built with the Django framework: https://download.csdn.net/download/hi_sir_destroy/11388170