爬虫实战计划的第一篇:淘女郎网站图片爬取
爬虫一般都是从爬取妹子图入门的,哈哈。这个算是最简单的爬虫了,只用了 urllib 标准库,也是最入门级别的爬虫方法。
ps:附上GitHub链接:
淘女郎网站图片爬取
代码展示:
def open_url(url):
    """Fetch *url* and return the raw response body as bytes.

    A desktop-browser User-Agent header is sent so the site does not
    reject the request as an obvious bot.

    :param url: absolute URL to fetch.
    :return: response body as ``bytes``.
    :raises urllib.error.HTTPError: propagated on HTTP error status.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36')
    # Use a context manager so the connection is always closed
    # (the original leaked the response object).
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_MM_Data(url, currentPage=0):
    """POST the model-search form and return the list of model records.

    :param url: search endpoint (``tstar_model.do``).
    :param currentPage: 1-based page number of the result listing.
    :return: the ``data.searchDOList`` list from the JSON response; each
        entry carries fields such as ``userId``, ``realName``, ``city``,
        ``height`` and ``weight``.
    """
    form = {
        'q': '',
        'viewFlag': 'A',
        'sortType': 'default',
        'searchStyle': '',
        'searchRegion': 'city:',
        'searchFansNum': '',
        'currentPage': currentPage,
        'pageSize': '100',
    }
    payload = urllib.parse.urlencode(form).encode('utf-8')
    req = urllib.request.Request(url, data=payload)
    req.add_header('user-agent',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36')
    # The endpoint replies in GBK.  Decode once and feed the str straight
    # to json.loads — the original decode('gbk').encode('utf-8') round-trip
    # was redundant.  Context manager closes the connection.
    with urllib.request.urlopen(req) as response:
        html = response.read().decode('gbk')
    return json.loads(html)['data']['searchDOList']
def get_MM_ID(data):
    """Return the ``userId`` of every model record in *data*, in order."""
    return [record['userId'] for record in data]
def get_MM_Name(data):
    """Append each record's ``realName`` to the module-level ``user_name``
    list and return that list.

    NOTE(review): relies on a global ``user_name`` list defined elsewhere
    in the file — confirm it is initialised before the first call.
    """
    user_name.extend(record['realName'] for record in data)
    return user_name
def get_MM_city(data):
    """Append each record's ``city`` to the module-level ``user_city``
    list and return that list.

    NOTE(review): relies on a global ``user_city`` list defined elsewhere
    in the file — confirm it is initialised before the first call.
    """
    user_city.extend(record['city'] for record in data)
    return user_city
def get_MM_Height(data):
    """Append each record's ``height`` to the module-level ``user_height``
    list and return that list.

    NOTE(review): relies on a global ``user_height`` list defined elsewhere
    in the file — confirm it is initialised before the first call.
    """
    user_height.extend(record['height'] for record in data)
    return user_height
def get_MM_Weight(data):
    """Append each record's ``weight`` to the module-level ``user_weight``
    list and return that list.

    NOTE(review): relies on a global ``user_weight`` list defined elsewhere
    in the file — confirm it is initialised before the first call.
    """
    user_weight.extend(record['weight'] for record in data)
    return user_weight
def get_photos_url(url):
    """Fetch the album-list page and return its unique album identifiers.

    :param url: album listing URL (``open_album_list.htm``).
    :return: list of ``'album_id=NNN'`` query fragments (full regex
        matches, as the caller interpolates them into a URL), unique and
        in first-seen order.
    """
    req = urllib.request.Request(url)
    req.add_header('user-agent',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36')
    # Context manager closes the connection (original leaked it).
    with urllib.request.urlopen(req) as response:
        page = response.read().decode('gbk')
    matches = re.findall('album_id=[0-9]*', page)
    # dict.fromkeys de-duplicates in O(n) while preserving first-seen
    # order, replacing the original O(n^2) "not in list" loop.
    return list(dict.fromkeys(matches))
def get_photos(url):
    """Return full ``https:`` URLs for every photo in the album at *url*.

    The endpoint returns GBK-encoded JSON whose ``picList`` entries carry
    protocol-relative ``picUrl`` values; each is prefixed with ``https:``.
    """
    album = json.loads(open_url(url).decode('gbk'))
    return ['https:' + entry['picUrl'] for entry in album['picList']]
主函数:
if __name__ == '__main__':
    url = 'https://mm.taobao.com/tstar/search/tstar_model.do?_input_charset=utf-8'
    data = get_MM_Data(url, 1)
    # Create the download root if it does not exist, then work inside it.
    if not os.path.exists('D://taoMM'):
        os.mkdir('D://taoMM')
    os.chdir('D://taoMM')
    ID = get_MM_ID(data)
    # These calls populate the module-level user_* lists as a side effect;
    # their return values are read back via the globals below.
    get_MM_city(data)
    get_MM_Name(data)
    get_MM_Weight(data)
    get_MM_Height(data)
    # BUG FIX: the original iterated range(0, len(ID) - 1), silently
    # skipping the last model on every page; iterate the whole list.
    for j in range(len(ID)):
        folder_name = user_name[j] + '_' + user_city[j] + '_体重' + str(user_weight[j]) + '_身高' + str(user_height[j])
        if not os.path.exists('D:\\taoMM\\' + folder_name):
            os.mkdir("D:\\taoMM\\" + folder_name)
        album = get_photos_url(
            'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20={}'.format(ID[j]))
        print('当前模特:{}'.format(user_name[j]))
        print(album)
        photos = get_photos(
            'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id={0}&{1}&top_pic_id=0&cover=%2F%2Fimg.alicdn.com%2Fimgextra&page=1'.format(
                ID[j], album[0]))
        print(photos)
        for pic in photos:
            name = pic.split('/')[-1]
            # BUG FIX: download before opening the output file, so a
            # failed request no longer leaves an empty file behind
            # (the original opened the file, then fetched inside it).
            try:
                content = open_url(pic)
            except urllib.error.HTTPError:
                continue
            with open('D://taoMM//' + folder_name + '/' + name, 'wb') as f:
                f.write(content)
效果展示: