Web Crawler: Crawling Images from Web Pages Based on a Search Keyword

Please credit the source when reposting!

Contents

Environment setup

First, define the request headers used to imitate a browser search

Search by keyword and collect the link of each result entry

Visit each result entry and crawl the images on that page


Environment setup

Environment: anaconda3

Python packages: urllib, sys, re, requests, BeautifulSoup (bs4, with the lxml parser)
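If requests, bs4, or lxml are missing from the anaconda3 environment, they can typically be installed with the standard PyPI package names (adjust to your own setup):

pip install requests beautifulsoup4 lxml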

First, define the request headers used to imitate a browser search

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, compress',
    'Accept-Language': 'en-us;q=0.5,en;q=0.3',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
    } 
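Note that in the snippets below only the requests.get(...) call actually sends these headers; the plain urllib.request.urlopen(url) call goes out without them. If you also want urllib to send them, the request can be wrapped in a urllib.request.Request first. A minimal sketch, assuming url already holds the target address and headers is the dict above:

import urllib.request

req = urllib.request.Request(url, headers=headers)  # attach the browser-like headers
response = urllib.request.urlopen(req)              # urlopen accepts a Request object
page = response.read()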

Search by keyword and collect the link of each result entry

def search(key, page_num):    # key: search keyword; page_num: result offset used by Baidu for paging
    url = 'http://www.baidu.com.cn/s?wd=' + urllib.parse.quote(key) + '&pn=' + str(page_num)  # wd is the keyword, pn is Baidu's paging offset

    response = urllib.request.urlopen(url)  # fetch the whole result page
    page = response.read()

    with open('search_test.txt', 'a') as all:  # append each result's real link to this file
        soup = BeautifulSoup(page, 'lxml')
        tagh3 = soup.find_all('h3')  # each result title sits inside an <h3> tag
        for h3 in tagh3:
            a = h3.find('a')
            if a is None or not a.get('href'):  # skip <h3> tags that are not result entries
                continue
            href = a.get('href')
            baidu_url = requests.get(url=href, headers=headers, allow_redirects=False)
            real_url = baidu_url.headers.get('Location', '')  # the 302 Location header holds the original url
            if real_url.startswith('http'):
                all.write(real_url + '\n')
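Baidu result links point at a www.baidu.com/link?url=... redirect rather than at the target site, which is why the code requests each href with allow_redirects=False and reads the real destination from the 302 Location header. A minimal sketch of just that step, pulled out into a hypothetical helper (resolve_baidu_link is not part of the original code):

import requests

def resolve_baidu_link(href, headers):
    # Request the Baidu redirect link without following it; the 302 response
    # carries the real destination in its Location header.
    resp = requests.get(href, headers=headers, allow_redirects=False)
    return resp.headers.get('Location', '')  # empty string if no redirect was issued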

Visit each result entry and crawl the images on that page

def getHtml(url):  # fetch the page behind one result link
    # open a url address
    page = urllib.request.urlopen(url)
    html = page.read()
    return html

def getImg(html):
    # regular expression for image URLs (only .jpg images are matched here)
    reg = r'http.*?\.jpg'
    # compile the regular expression into an object
    imgre = re.compile(reg)
    html = html.decode('utf-8', errors='ignore')  # Python 3: bytes -> str
    print(type(html))
    imglist = re.findall(imgre, html)
    print(imglist)

    # download each remote image into the local image/ folder (which must already exist),
    # naming the files with an increasing counter
    x = 0
    for imgurl in imglist:
        urllib.request.urlretrieve(imgurl, 'image/%s.jpg' % x)
        x += 1
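The regex r'http.*?\.jpg' only catches absolute .jpg URLs that appear literally in the page source; PNGs, protocol-relative links, and relative src paths are all missed. Since BeautifulSoup is already in use, an alternative is to collect the src attribute of every <img> tag and resolve it against the page URL. A rough sketch of that alternative (not the approach used in the original code):

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def list_image_urls(html, base_url):
    # Collect absolute image URLs from <img src=...> tags; html is assumed to be
    # decoded text and base_url the address the page was fetched from.
    soup = BeautifulSoup(html, 'lxml')
    urls = []
    for img in soup.find_all('img'):
        src = img.get('src')
        if src:
            urls.append(urljoin(base_url, src))  # resolve relative paths against the page URL
    return urls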

 

Full code

# -*- coding: utf-8 -*-

'''
author: Elijah Lee
desc: kiwiSpider
'''
import os
import sys
import re
import urllib.request
import urllib.parse

import requests
from bs4 import BeautifulSoup

# import downloadImg

def getHtml(url):
    # open a url address
    page = urllib.request.urlopen(url)
    html = page.read()
    return html

def getImg(html):
    # regular expression for image URLs (only .jpg images are matched)
    reg = r'http.*?\.jpg'
    # compile the regular expression into an object
    imgre = re.compile(reg)
    html = html.decode('utf-8', errors='ignore')  # Python 3: bytes -> str
    print(type(html))
    imglist = re.findall(imgre, html)
    print(imglist)

    # download the remote images directly to the local image/ folder,
    # naming the files with an increasing counter
    os.makedirs('image', exist_ok=True)
    x = 0
    for imgurl in imglist:
        urllib.request.urlretrieve(imgurl, 'image/%s.jpg' % x)
        x += 1

def search(key, page_num):
    # request headers used to masquerade as a browser
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, compress',
        'Accept-Language': 'en-us;q=0.5,en;q=0.3',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
    }

    url = 'http://www.baidu.com.cn/s?wd=' + urllib.parse.quote(key) + '&pn=' + str(page_num)  # wd is the keyword, pn is Baidu's paging offset

    response = urllib.request.urlopen(url)
    page = response.read()

    with open('search_test.txt', 'a') as all:
        soup = BeautifulSoup(page, 'lxml')
        tagh3 = soup.find_all('h3')
        for h3 in tagh3:
            a = h3.find('a')
            if a is None or not a.get('href'):
                continue
            href = a.get('href')
            baidu_url = requests.get(url=href, headers=headers, allow_redirects=False)
            real_url = baidu_url.headers.get('Location', '')  # get the original url from the redirect
            if real_url.startswith('http'):
                all.write(real_url + '\n')
                download(real_url)
# download the images from one result page
def download(url):
    # fetch the page behind the result link
    html = getHtml(url)
    # extract and save the images it contains
    getImg(html)

if __name__ == '__main__':
    key = input('input key word:')
    for page_num in range(0, 30, 10):  # first three result pages; Baidu pages in steps of 10
        search(key, page_num)
    print("over!")

Reference: https://www.cnblogs.com/fnng/p/3576154.html

 

 
