开发环境介绍
python 3.6
requests 第三方包
re 正则模块
pycharm 编辑器
一、先分析百度贴吧网页
以爬取头像为例,先打开头像吧网址
http://tieba.baidu.com/f?kw=头像&ie=utf-8&pn=0
按F12或右键->检查打开开发者中心,查看头像的图片地址
使用正则匹配出图片的地址
# Fetch the forum list page with a desktop browser User-Agent
# (bare requests defaults get served a different / blocked page).
res = requests.get(url=url, headers={'User-Agent': user_agent_list[0]})
if res.status_code == 200:
    html = res.text
    print(html)
    # Example of the tag we want to match (bpic holds the full-size image URL):
    # <img src="" attr="56613" data-original="http://imgsrc.baidu.com/forum/wh%3D200%2C90%3B/sign=6244351eb94543a9f54ef2ce2e27a6bb/f982d143ad4bd113a2ad5cd854afa40f4afb058b.jpg" bpic="http://imgsrc.baidu.com/forum/w%3D580%3B/sign=5d094a34fe1f3a295ac8d5c6a91ebd31/a686c9177f3e67091860ae1535c79f3df9dc55fc.jpg" class="threadlist_pic j_m_pic " />
    # Raw string (r'...') so "\d" is a regex digit class, not an invalid
    # Python string escape (DeprecationWarning on 3.6+).
    purl = r'<img src="" attr="\d+" data-original=".*?" bpic="(.*?)" class="threadlist_pic j_m_pic " />'
    ret = re.findall(purl, html)
将图片写入本地
# Download every matched image URL and save it under directory_path.
for imgurl in ret:
    # Rotate the User-Agent per request to look less like a bot.
    img = requests.get(imgurl, headers={'User-Agent': user_agent_list[random.randint(0, 10)]}).content
    # Last path segment of the URL doubles as the local file name.
    imgname = imgurl.split('/')[-1]
    with open(directory_path + imgname, 'wb') as f:
        f.write(img)
    print(imgname)
    # Pause 0.30-0.50 s between downloads to be polite to the server.
    # NOTE: the original used "// 100" (floor division), which always
    # evaluates to 0 and never slept.
    sleep(random.randint(30, 50) / 100)
整体代码:
"""
爬取页面的所有图片地址
按照地址发送请求爬取图片保存
"""
import os
import random
import re
from time import sleep
from urllib.parse import quote

import requests
def getpic(url, directory_path):
    """Scrape one tieba list page and save every thread-preview image.

    Parameters:
        url: the tieba list-page URL to fetch (e.g. .../f?kw=...&pn=0).
        directory_path: directory the images are written into; created
            automatically if it does not exist.

    Side effects: writes image files, prints progress to stdout.
    """
    # The original assigned `user_agent_list = header`, but `header` is
    # never defined anywhere -> NameError on the first call. Use a small
    # built-in pool of desktop User-Agent strings instead.
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    ]
    # Ensure the save directory exists; the original crashed with
    # FileNotFoundError on the first open() if it was missing.
    os.makedirs(directory_path, exist_ok=True)
    res = requests.get(
        url=url,
        headers={'User-Agent': user_agent_list[0]},
        timeout=10,  # never hang forever on a dead connection
    )
    if res.status_code == 200:
        html = res.text
        print(html)
        # Matched tag shape (bpic carries the full-size image URL):
        # <img src="" attr="..." data-original="..." bpic="..." class="threadlist_pic j_m_pic " />
        # Raw string so "\d" is a regex digit class, not an invalid escape.
        purl = r'<img src="" attr="\d+" data-original=".*?" bpic="(.*?)" class="threadlist_pic j_m_pic " />'
        ret = re.findall(purl, html)
        for imgurl in ret:
            # random.choice cannot index out of range, unlike the original
            # randint(0, 10) against a list of unknown length.
            img = requests.get(
                imgurl,
                headers={'User-Agent': random.choice(user_agent_list)},
                timeout=10,
            ).content
            # Last URL path segment doubles as the local file name.
            imgname = imgurl.split('/')[-1]
            with open(os.path.join(directory_path, imgname), 'wb') as f:
                f.write(img)
            print(imgname)
            # Pause 0.30-0.50 s between downloads; the original "// 100"
            # floor-divided to 0 and never actually slept.
            sleep(random.randint(30, 50) / 100)
if __name__ == '__main__':
    # Resulting URL looks like:
    # http://tieba.baidu.com/f?kw=%E5%A4%B4%E5%83%8F&ie=utf-8&pn=0
    # (the unused `proxy_list` from the original was removed)
    # search keyword: the tieba forum name
    keywords = "头像"
    # directory images are saved into
    directory_path = 'touxiang/'
    # percent-encode the keyword so it is safe inside the URL
    unicode_keywords = quote(keywords)
    # scrape the first two list pages; pn advances 50 threads per page
    for i in range(0, 2):
        html_url = 'http://tieba.baidu.com/f?kw=%s&ie=utf-8&pn=%s' % (unicode_keywords, i * 50)
        print(html_url)
        getpic(url=html_url, directory_path=directory_path)
运行结果,共爬取了252张头像
温馨提示:仅供个人学习娱乐,切勿他用!!!