import urllib.request
import urllib.parse
import re
import os
import time
def handle_request(url, page):
    """Build the HTTP request for one listing page.

    The page number followed by a trailing slash is appended to ``url``
    and the request is given a desktop-Chrome ``User-Agent`` header.

    Args:
        url: Base listing URL that the page number is appended to.
        page: Page number; converted with ``str()``.

    Returns:
        An unsent ``urllib.request.Request`` for the page URL.
    """
    # NOTE: the header value (including its leading space) is kept exactly
    # as the script has always sent it.
    user_agent = (
        ' Mozilla/5.0 (Windows NT 6.1; Win64;'
        ' x64) AppleWebKit/537.36 (KHTML, like'
        ' Gecko) Chrome/71.0.3578.98 Safari/537.36'
    )
    page_url = url + str(page) + '/'
    return urllib.request.Request(
        url=page_url,
        headers={'User-Agent': user_agent},
    )
def parse_content(request):
    """Fetch a listing page, extract its thumbnail images and download them.

    The decoded page text is also written to ``F.html`` in the current
    working directory (overwritten on every call; presumably a debugging
    snapshot).

    Args:
        request: The ``urllib.request.Request`` to open.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    # Use the response as a context manager so the connection is closed
    # deterministically instead of being left to the garbage collector.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf8')
    with open('F.html', 'w', encoding='utf8') as fp:
        fp.write(content)
    # Each match is an (img src, img alt) pair from one "thumb" <div>;
    # re.S lets ".*?" run across the line breaks inside the div.
    pattern = re.compile(r'<div class="thumb">.*?<a href=".*?" target="_blank">.*?<img src="(.*?)" alt="(.*?)" />.*?</div>', re.S)
    ret = pattern.findall(content)
    download(ret)
# Characters that are path separators or otherwise invalid in file names on
# common file systems; each one is replaced with "_".
_UNSAFE_FILE_NAME_CHARS = str.maketrans(dict.fromkeys('\\/:*?"<>|\r\n\t', '_'))


def _safe_file_name(image_alt, image_src):
    """Return ``<alt>.<suffix>`` with unsafe file-name characters replaced."""
    suffix = image_src.split('.')[-1]
    return (image_alt + '.' + suffix).translate(_UNSAFE_FILE_NAME_CHARS)


def download(ret):
    """Download every image in ``ret`` into the ``tupian`` directory.

    Each file is named after the image's alt text plus the text after the
    last ``.`` of its URL. Because both come from scraped page content, the
    name is sanitised so that it can neither contain path separators nor
    characters the file system rejects.

    Args:
        ret: Iterable of ``(src, alt)`` pairs, where ``src`` is a
            scheme-relative URL (``https:`` is prepended to it).

    Raises:
        urllib.error.URLError: If an image cannot be fetched.
    """
    if not ret:
        # Nothing to download: do not create the output directory.
        return
    dirName = 'tupian'
    # Created once up front; exist_ok avoids the exists()/mkdir() race.
    os.makedirs(dirName, exist_ok=True)
    for image_info in ret:
        image_src = 'https:' + image_info[0]
        image_alt = image_info[1]
        fileName = _safe_file_name(image_alt, image_src)
        filePath = os.path.join(dirName, fileName)
        print(filePath)
        print("正在下载%s....."%fileName)
        urllib.request.urlretrieve(image_src, filePath)
        print("结束下载%s"%fileName)
        # Pause between images to avoid hammering the server.
        time.sleep(2)
def main():
    """Ask for a first and last page number and crawl that range inclusively.

    For every page a request is built, the page is parsed (which downloads
    its images), and the loop then sleeps for one second.

    Raises:
        ValueError: If either prompt is answered with a non-integer.
    """
    first_page = int(input('请输入开始爬取的页面:'))
    last_page = int(input("请输入结束的页面:"))
    base_url = 'https://www.qiushibaike.com/pic/page/'
    page = first_page
    while page <= last_page:
        print('正在下载%s页.....'%page)
        parse_content(handle_request(base_url, page))
        print("结束下载第%s页"%page)
        time.sleep(1)
        page += 1
# Start the crawler only when the file is executed as a script.
if __name__ == '__main__':
    main()
# Second script: crawl the picture gallery on sc.chinaz.com (original heading: "爬取美女图片").
import urllib.request
import urllib.parse
import time
import os
from lxml import etree
def handle_request(page):
    """Build the HTTP request for one gallery listing page.

    Page 1 uses the bare listing URL; every other page number is appended
    to the file name as ``_<page>``.

    Args:
        page: Page number of the listing.

    Returns:
        An unsent ``urllib.request.Request`` carrying a Firefox
        ``User-Agent`` header.
    """
    url = (
        'http://sc.chinaz.com/tupian/xingganmeinvtupian.html'
        if page == 1
        else 'http://sc.chinaz.com/tupian/xingganmeinvtupian_' + str(page) + '.html'
    )
    user_agent = (
        'Mozilla/5.0 (X11; U; Linux x86_64;'
        ' zh-CN; rv:1.9.2.10) Gecko/20100922'
        ' Ubuntu/10.10 (maverick) Firefox/3.6.10'
    )
    return urllib.request.Request(url=url, headers={'User-Agent': user_agent})
def parse_content(content):
    """Find the image URLs in a listing page and download each of them.

    Args:
        content: HTML text of the listing page.
    """
    tree = etree.HTML(content)
    if tree is None:
        # NOTE(review): lxml appears to return None when the markup has no
        # parseable content; treated here as "no images" - confirm.
        return
    for div in tree.xpath('//div[@id="container"]/div'):
        # The image URL is read from the "src2" attribute of each <img>.
        for img_src in div.xpath('.//div/a/img/@src2'):
            download_image(img_src)
def download_image(img_src):
    """Download one image into the ``xinggan`` directory.

    The saved file keeps the last path component of ``img_src`` as its
    name; the destination path is printed before the download starts.

    Args:
        img_src: Absolute URL of the image.

    Raises:
        urllib.error.URLError: If the image cannot be fetched.
    """
    dir_path = 'xinggan'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(dir_path, exist_ok=True)
    file_path = os.path.join(dir_path, os.path.basename(img_src))
    print(file_path)
    header = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64;'
                            ' zh-CN; rv:1.9.2.10) Gecko/20100922'
                            ' Ubuntu/10.10 (maverick) Firefox/3.6.10'}
    request = urllib.request.Request(url=img_src, headers=header)
    # The response is opened first, so a failed fetch leaves no empty file;
    # both handles are closed even if the write fails.
    with urllib.request.urlopen(request) as response, \
            open(file_path, 'wb') as fp:
        fp.write(response.read())
def main():
    """Ask for a first and last page number and crawl that range inclusively.

    Each listing page is fetched and decoded with the default codec, its
    images are downloaded, and the loop then sleeps for one second.

    Raises:
        ValueError: If either prompt is answered with a non-integer.
        urllib.error.URLError: If a listing page cannot be fetched.
    """
    start_page = int(input("请输入起始页:"))
    end_page = int(input("请输入结束页:"))
    for page in range(start_page, end_page + 1):
        request = handle_request(page)
        # Close the response explicitly rather than leaking the connection.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode()
        parse_content(content)
        time.sleep(1)
# Start the crawler only when the file is executed as a script.
if __name__ == "__main__":
    main()