Let's start with a plain dish: a straightforward, general-purpose scraping script.
import random
import urllib.parse
import urllib.request

# Pool of User-Agent strings; picking one at random makes requests look less uniform
headers_list = [
    {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, '
                   'like Gecko) Chrome/81.0.4044.129 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/39.0.2171.71 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) '
                   'Chrome/23.0.1271.64 Safari/537.11'},
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, '
                   'like Gecko) Chrome/10.0.648.133 Safari/534.16'},
]
headers = random.choice(headers_list)
print(headers)

name = input('Enter the Tieba forum name: ')
start = int(input('Enter the start page: '))
end = int(input('Enter the end page: '))
kw = urllib.parse.urlencode({'kw': name})

# Build each page URL, send the request, and save the response
baseurl = 'https://tieba.baidu.com/f?'
for i in range(start, end + 1):
    # Each Tieba page lists 50 posts, so the pn offset is (page - 1) * 50
    pn = (i - 1) * 50
    url = baseurl + kw + '&pn=' + str(pn)
    # Send the request
    req = urllib.request.Request(url, headers=headers)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    # Write the page to a file
    filename = 'page_' + str(i) + '.html'
    with open(filename, 'w', encoding='utf-8') as f:
        print('Scraping page {1} of the {0} forum'.format(name, i))
        f.write(html)
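One caveat with the bare script: any network hiccup raises an exception and kills the whole loop. As a hedged sketch (the fetch helper below is hypothetical, not part of the original code), the request step could add a timeout, retries, and a short pause between attempts using only the standard library:

import time
import urllib.error
import urllib.request

def fetch(url, headers, retries=3, delay=1.0):
    # Hypothetical helper: retry transient failures and pause between attempts
    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req, timeout=10) as res:
                return res.read().decode('utf-8')
        except urllib.error.URLError as e:
            print('Request failed ({0}), attempt {1}/{2}'.format(e, attempt + 1, retries))
            time.sleep(delay)
    return None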
Next, wrap the same code in functions for the scrape.
import urllib.parse
import urllib.request


def readPage(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, '
                      'like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
    # Create the request object
    req = urllib.request.Request(url, headers=headers)
    # Get the response object
    response = urllib.request.urlopen(req)
    # Read the response body
    html = response.read().decode('utf-8')
    return html


# Write one page to a file
def writePage(filename, html, name, i):
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(html)
    return 'Scraping page {1} of the {0} forum'.format(name, i)


# Main function
def main():
    name = input('Enter the Tieba forum name: ')
    start = int(input('Enter the first page to scrape: '))
    end = int(input('Enter the last page to scrape: '))
    kw = urllib.parse.urlencode({'kw': name})
    baseurl = 'https://tieba.baidu.com/f?'
    for i in range(start, end + 1):
        # Build the URL for page i
        pn = (i - 1) * 50
        url = baseurl + kw + '&pn=' + str(pn)
        html_get = readPage(url)
        filename = name + '_page_' + str(i) + '.html'
        printing = writePage(filename, html_get, name, i)
        print(printing)


if __name__ == '__main__':
    main()
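For reference on why the forum name goes through urllib.parse.urlencode rather than straight into the URL: non-ASCII names must be UTF-8 percent-encoded. A quick standalone check:

import urllib.parse

# ASCII names pass through unchanged
print(urllib.parse.urlencode({'kw': 'python'}))  # kw=python
# Non-ASCII names are UTF-8 percent-encoded
print(urllib.parse.urlencode({'kw': '编程'}))    # kw=%E7%BC%96%E7%A8%8B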
Finally, do the same scrape in an object-oriented style.
import urllib.parse
import urllib.request


class BaiduSpider:
    def __init__(self):
        # Constant values shared by every request live in __init__
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, '
                          'like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
        # Base URL for Tieba forum pages (not Baidu web search)
        self.baseurl = 'https://tieba.baidu.com/f?'

    def readPage(self, url):
        # Create the request object
        req = urllib.request.Request(url, headers=self.headers)
        # Get the response object
        response = urllib.request.urlopen(req)
        # Read the response body
        html = response.read().decode('utf-8')
        return html

    # Write one page to a file
    def writePage(self, filename, html, name, i):
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)
        return 'Scraping page {1} of the {0} forum'.format(name, i)

    # Main method
    def main(self):
        name = input('Enter the Tieba forum name: ')
        start = int(input('Enter the first page to scrape: '))
        end = int(input('Enter the last page to scrape: '))
        kw = urllib.parse.urlencode({'kw': name})
        for i in range(start, end + 1):
            # Build the URL for page i
            pn = (i - 1) * 50
            url = self.baseurl + kw + '&pn=' + str(pn)
            html_get = self.readPage(url)
            filename = name + '_page_' + str(i) + '.html'
            printing = self.writePage(filename, html_get, name, i)
            print(printing)


if __name__ == '__main__':
    # To call the class's methods, instantiate it first
    spider = BaiduSpider()
    spider.main()
When scraping any target site, always comply with its spider rules (robots.txt), and never do anything illegal. A minimal automated check is sketched below.
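As a hedged sketch (not part of the original tutorial), the standard-library urllib.robotparser can consult a site's robots.txt before fetching; the Tieba URL here is only an illustration:

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url('https://tieba.baidu.com/robots.txt')
rp.read()
# Only fetch the page if robots.txt allows it for our user agent
if rp.can_fetch('*', 'https://tieba.baidu.com/f?kw=python&pn=0'):
    print('Allowed to fetch')
else:
    print('Disallowed by robots.txt')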