'''
用正则表达式爬取百度照片
'''
import requests
def get_source(url, timeout=10):
    """Download *url* and return the response body as UTF-8 decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout a stalled connection would block the script forever.

    Returns:
        The body of the response as a string.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server does not answer within *timeout* seconds.
    """
    # ``headers`` is the module-level dict assigned in the script entry point.
    req = requests.get(url, headers=headers, timeout=timeout)
    # Decode as UTF-8 instead of relying on the encoding requests guesses.
    req.encoding = 'utf-8'
    return req.text
import re
def get_img(source):
    """Collect the value of every ``"objURL":"..."`` entry found in *source*.

    Args:
        source: Text of a result page.

    Returns:
        A list with the captured strings, in the order they appear; the
        list is also printed to standard output.
    """
    # Non-greedy capture: each match stops at the first closing quote.
    obj_url_pattern = re.compile('"objURL":"(.*?)"')
    found = obj_url_pattern.findall(source)
    print(found)
    return found
def save_img(img, directory='img'):
    """Download every image URL in *img* and store it inside *directory*.

    The file name is built from the last ten characters of the URL with
    characters that are not allowed in file names removed; ``.jpg`` is
    appended when the name does not end in a known image extension.  A URL
    whose download fails is reported and skipped, and no file is created
    for it.

    Args:
        img: Iterable of image URL strings.
        directory: Existing directory the files are written to.
    """
    for each_img in img:
        # Slice the tail of the URL; a plain index would give one character
        # and make almost every image overwrite the same file.
        name = each_img[-10:]
        # Strip path separators and other characters invalid in file names.
        name = re.sub(r'[\\/:*?"<>|]', '', name)
        if re.search(r'(\.jpg|\.png|\.jpeg|\.gif)$', name) is None:
            name = name + '.jpg'
        # Fetch before opening the file so a failed request neither leaves
        # an empty file behind nor reaches the write without a response.
        try:
            r = requests.get(each_img, headers=headers, timeout=10)
            # Treat HTTP error pages as failures instead of saving them.
            r.raise_for_status()
        except requests.RequestException as e:
            print(e)
            continue
        with open(directory + '/' + name, 'wb') as f:
            f.write(r.content)
import urllib.parse
import os
if __name__ == '__main__':
    # Script entry point: ask for a keyword and a page range, then fetch
    # each result page, extract its image URLs and download them.

    # makedirs(exist_ok=True) keeps the script re-runnable; os.mkdir raised
    # FileExistsError as soon as the directory already existed.
    os.makedirs('img', exist_ok=True)
    # Kept at module level on purpose: get_source() and save_img() read
    # ``headers`` as a global.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
    keyword = input('请输入查询照片关键词:')
    # Percent-encode the keyword so it can be embedded in the query string.
    keyword = urllib.parse.quote(keyword)
    page_start = int(input('请输入查询初始页码:'))
    page_end = int(input('请输入查询末端页码:'))
    for i in range(page_start, page_end + 1):
        # The ``pn`` query value advances by 50 per 1-based page number.
        page = str((i - 1) * 50)
        url = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + keyword + '&pn=' + page
        print(url)
        source = get_source(url)
        img = get_img(source)
        save_img(img)
'''
用xpath爬取百度照片
'''
import requests
def get_source(url, timeout=10):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout a stalled connection would block the script forever.

    Returns:
        The response body as a string.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server does not answer within *timeout* seconds.
    """
    # ``headers`` is the module-level dict assigned in the script entry point.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode as UTF-8 instead of relying on the encoding requests guesses.
    response.encoding = 'utf-8'
    return response.text
import lxml
from lxml import etree
def get_img(source):
    """Return the ``src`` attribute of each image matched by the XPath query.

    Args:
        source: HTML text of a result page.

    Returns:
        The list produced by evaluating ``//div/ul/li/a/img/@src`` on the
        parsed document; the list is also printed to standard output.
    """
    document = etree.HTML(source)
    src_values = document.xpath('//div/ul/li/a/img/@src')
    print(src_values)
    return src_values
import re
def save_img(img, directory='img1'):
    """Download every image URL in *img* and store it inside *directory*.

    The file name is built from the last ten characters of the URL with
    characters that are not allowed in file names removed; ``.jpg`` is
    appended when the name does not end in a known image extension.  A URL
    whose download fails is reported and skipped, and no file is created
    for it.

    Args:
        img: Iterable of image URL strings.
        directory: Existing directory the files are written to.
    """
    for each_img in img:
        # Slice the tail of the URL; a plain index would give one character
        # and make almost every image overwrite the same file.
        name = each_img[-10:]
        # Strip path separators and other characters invalid in file names.
        name = re.sub(r'[\\/:*?"<>|]', '', name)
        if re.search(r'(\.jpg|\.png|\.jpeg|\.gif)$', name) is None:
            name = name + '.jpg'
        # Fetch before opening the file so a failed request neither leaves
        # an empty file behind nor reaches the write without a response.
        try:
            r = requests.get(each_img, headers=headers, timeout=10)
            # Treat HTTP error pages as failures instead of saving them.
            r.raise_for_status()
        except requests.RequestException as e:
            print(e)
            continue
        with open(directory + '/' + name, 'wb') as f:
            f.write(r.content)
import urllib.parse
import os
if __name__ == '__main__':
    # Script entry point: ask for a keyword and a page range, then fetch
    # each result page, extract its image URLs and download them.

    # makedirs(exist_ok=True) keeps the script re-runnable; os.mkdir raised
    # FileExistsError as soon as the directory already existed.
    os.makedirs('img1', exist_ok=True)
    # Kept at module level on purpose: get_source() and save_img() read
    # ``headers`` as a global.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'}
    keyword = input('请输入查询照片关键词:')
    # Percent-encode the keyword so it can be embedded in the query string.
    keyword = urllib.parse.quote(keyword)
    page_start = int(input('请输入查询初始页码:'))
    page_end = int(input('请输入查询末端页码:'))
    for i in range(page_start, page_end + 1):
        # The ``pn`` query value advances by 50 per 1-based page number.
        page = str((i - 1) * 50)
        url = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + keyword + '&pn=' + page
        print(url)
        source = get_source(url)
        img = get_img(source)
        save_img(img)