# 1. 正常方法贴吧爬取 (basic inline Tieba crawl script)
import urllib.request
import urllib.parse
# Crawl Baidu Tieba search-result pages for a user-supplied forum name and
# save each page's raw HTML to its own file.
baseurl = 'https://tieba.baidu.com/f?'
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3706.400 SLBrowser/10.0.4040.400'}
name = input('请输入:')
up = int(input('输入起始页:'))
end = int(input('输入末尾页: '))
kw = urllib.parse.urlencode({'kw': name})
# BUG FIX: the end page is meant to be inclusive (the second script below uses
# end + 1), but this loop was range(up, end) and silently skipped the last page.
for i in range(up, end + 1):
    pn = (i - 1) * 50  # Tieba paginates 50 posts per page via the pn offset.
    url = baseurl + kw + '&pn=' + str(pn)
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    html1 = res.read().decode('utf-8')
    filename = '第' + str(i) + '页贴吧内容'
    with open(filename, 'w', encoding='utf-8') as f:
        print('正在爬取第{}页'.format(i))
        f.write(html1)
# 2. 利用函数进行贴吧爬取 (function-based Tieba crawl)
import urllib.request
import urllib.parse
def readage(url):
    """Fetch *url* with a desktop browser User-Agent and return the body
    decoded as UTF-8 text."""
    headers = {'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3706.400 SLBrowser/10.0.4040.400'}
    # BUG FIX: the original bound the Request to `res` and the response to
    # `req` (names swapped) and never closed the response; use a context
    # manager so the connection is released.
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as res:
        return res.read().decode('utf-8')
def writeage(filename, html):
    """Write *html* text to *filename*, UTF-8 encoded, replacing any
    existing content."""
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
def main():
    """Prompt for a forum name and an inclusive page range, then fetch each
    result page and save its HTML to a per-page file."""
    name = input('请输入:')
    up = int(input('输入起始页:'))
    end = int(input('输入末尾页: '))
    kw = urllib.parse.urlencode({'kw': name})
    baseurl = 'https://tieba.baidu.com/f?'  # loop-invariant, hoisted
    for i in range(up, end + 1):
        pn = (i - 1) * 50  # Tieba paginates 50 posts per page via pn.
        url = baseurl + kw + '&pn=' + str(pn)
        html = readage(url)
        # BUG FIX: the original wrote every page to the single fixed name
        # '贴吧爬取2', so each iteration overwrote the previous page and
        # only the last one survived; include the page number instead.
        filename = '贴吧爬取2_第' + str(i) + '页'
        writeage(filename, html)
# BUG FIX: the guard was `if name ==‘main’:` — missing the dunder
# underscores and using smart quotes, so main() would never run (and the
# line is a syntax error). This is the standard script-entry idiom.
if __name__ == '__main__':
    main()