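# Practice project: a web scraper built with requests.
# When saving images, folders are de-duplicated; individual images are not de-duplicated yet.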
import requests
from bs4 import BeautifulSoup
import re
import os
import time
# Module-level listing-page address. get_all() does not read this value.
url = "http://www.58gc.cn/xgmn/"
def get_all(url="http://www.58gc.cn/dcd/", timeout=10):
    """Yield ``(group_url, group_name)`` for each image group on a listing page.

    The page is fetched with a browser-like User-Agent and parsed with the
    ``lxml`` parser. For every ``<a>`` matched by
    ``li.wenshens > a:nth-of-type(1)``, the link's ``href`` and the ``alt``
    text of the ``<img>`` inside it are yielded as a pair.

    This is a generator, so the HTTP request is only issued when iteration
    starts.

    Args:
        url: Listing page to fetch. Defaults to the address this function
            previously hard-coded, so ``get_all()`` behaves as before.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the script forever.

    Yields:
        tuple: ``(href, alt)`` of one image group. ``alt`` is ``None`` when
        the ``<img>`` has no ``alt`` attribute.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out (raised on first iteration).
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0"}
    response = requests.get(url, headers=headers, timeout=timeout)
    # NOTE: a forced "gbk" response encoding was left disabled by the original
    # author; the text is decoded with whatever encoding requests picks.
    soup = BeautifulSoup(response.text, 'lxml')

    for link in soup.select('li.wenshens > a:nth-of-type(1) '):
        group_url = link.get('href')  # link to the image group
        cover = link.find('img')
        # Skip anchors lacking a target or a cover image instead of letting
        # one malformed entry abort the whole listing.
        if group_url is None or cover is None:
            continue
        yield (group_url, cover.get('alt'))  # alt text is the group name
def get_tuzu(tz):#获取单独一组图片中每张图片的链接并下载
time.sleep(2)
url=tz[0]
name=tz[1]
headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1