Regex Parsing
Scrape every image from one page of Qiushibaike (糗事百科).
import requests
import re
import os

if __name__ == "__main__":
    # Create the output directory if it does not already exist
    if not os.path.exists('./qiutuLibs'):
        os.mkdir('./qiutuLibs')
    url = "https://www.qiushibaike.com/imgrank/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    page_text = requests.get(url=url, headers=headers).text
    # with open('./page_text', 'w', encoding='utf-8') as fp:
    #     fp.write(page_text)
    # Focused crawl: extract every image URL on the page with a regex
    extract_img_url = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    img_src_list = re.findall(extract_img_url, page_text, re.S)
    for img_src in img_src_list:
        # The src values are protocol-relative, so prepend the scheme
        img_src = 'https:' + img_src
        img_data = requests.get(img_src, headers=headers).content
        # Use the last URL segment as the file name
        img_name = img_src.split('/')[-1]
        img_path = './qiutuLibs/' + img_name
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(f'Image {img_name} saved successfully!')
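The pattern depends on re.S (DOTALL mode), which lets .*? span the newlines inside each div block. A minimal sketch of the extraction step in isolation; the sample_html below is made-up markup for illustration, not the site's actual HTML:

import re

# Made-up markup for illustration; the real Qiushibaike HTML may differ.
sample_html = '''
<div class="thumb">
<a href="/article/000000" target="_blank">
<img src="//pic.qiushibaike.com/system/pictures/demo/medium/DEMO.jpg" alt="demo">
</a>
</div>
'''

pattern = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
# Without re.S, '.' stops at newlines and findall returns nothing here
print(re.findall(pattern, sample_html, re.S))
# ['//pic.qiushibaike.com/system/pictures/demo/medium/DEMO.jpg']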
Regex Parsing: Paginated Crawling
Wrap the request in a loop: observe how the URL differs from page to page and build a generic URL template from that pattern.
An alternative to the f-string used below is %-formatting:

url = "https://www.qiushibaike.com/imgrank/page/%d/"
for page in range(1, 3):
    new_url = url % page

This yields the URL for each page.
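A quick sanity check of the template; note that range(1, 3) covers pages 1 and 2 only:

url = "https://www.qiushibaike.com/imgrank/page/%d/"
for page in range(1, 3):
    print(url % page)
# https://www.qiushibaike.com/imgrank/page/1/
# https://www.qiushibaike.com/imgrank/page/2/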
import requests
import re
import os

if __name__ == "__main__":
    # Create the output directory if it does not already exist
    if not os.path.exists('./qiutuLibs'):
        os.mkdir('./qiutuLibs')
    # url = "https://www.qiushibaike.com/imgrank/page/{}/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # Loop over pages 1 and 2, filling the page number into the URL template
    for page in range(1, 3):
        url = f"https://www.qiushibaike.com/imgrank/page/{page}/"
        page_text = requests.get(url=url, headers=headers).text
        # with open('./page_text', 'w', encoding='utf-8') as fp:
        #     fp.write(page_text)
        # Focused crawl: extract every image URL on the page with a regex
        extract_img_url = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
        img_src_list = re.findall(extract_img_url, page_text, re.S)
        for img_src in img_src_list:
            # The src values are protocol-relative, so prepend the scheme
            img_src = 'https:' + img_src
            img_data = requests.get(img_src, headers=headers).content
            # Use the last URL segment as the file name
            img_name = img_src.split('/')[-1]
            img_path = './qiutuLibs/' + img_name
            with open(img_path, 'wb') as fp:
                fp.write(img_data)
            print(f'Image {img_name} saved successfully!')
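Both scripts assume every request succeeds. As an optional hardening sketch (the download_image helper is my own addition, not part of the original code), the download step could skip failures instead of crashing and throttle requests:

import time
import requests

def download_image(img_src, img_path, headers):
    """Hypothetical helper: download one image, skipping failures."""
    try:
        resp = requests.get(img_src, headers=headers, timeout=10)
        resp.raise_for_status()  # raise for 4xx/5xx status codes
    except requests.exceptions.RequestException as e:
        print(f'Failed to download {img_src}: {e}')
        return
    with open(img_path, 'wb') as fp:
        fp.write(resp.content)
    time.sleep(0.5)  # small delay to be polite to the server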