爬取淘宝女郎照片-写给初步入门爬虫的读者
要爬取的照片示例:
Python 2.7 爬虫代码如下:
#coding=utf-8
import urllib2
# Listing endpoint for Taobao MM; the zero-based page index is appended to it.
mmurl = "https://mm.taobao.com/json/request_top_list.htm?type=0&page="

# Markers that delimit a model profile link on a listing page.
MODEL_HEAD = "href=\"//"
MODEL_TAIL = "\" target="
# Markers that delimit an image link on a model profile page.
IMAGE_HEAD = "src=\"//"
IMAGE_TAIL = ".jpg\">"
SEPARATOR = "***************************************"


def iter_between(text, head, tail, keep=0):
    """Yield every substring of *text* enclosed by *head* and *tail*.

    Args:
        text: The document to scan.
        head: Marker that opens a match; it is not part of the result.
        tail: Marker that closes a match.
        keep: Number of leading characters of *tail* to retain at the end of
            each result (4 keeps the ".jpg" of a '.jpg">' tail).

    Yields:
        One string per occurrence of *head* that is followed by *tail*.
        Scanning stops at the first *head* with no *tail* after it, because
        no later *head* can be terminated either.
    """
    start = text.find(head)
    while start != -1:
        begin = start + len(head)
        end = text.find(tail, begin)
        if end == -1:
            # Without this guard the slice would use -1 as its end index and
            # return an arbitrary chunk of the page instead of a URL.
            break
        yield text[begin:end + keep]
        start = text.find(head, start + 1)


def fetch(url):
    """Return the raw body served at *url*, closing the connection afterwards."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def crawl(first_page=0, page_count=1):
    """Print the model profile URLs and image URLs found on the listing pages.

    Args:
        first_page: Index of the first listing page to request.
        page_count: Number of consecutive listing pages to request. The
            original author's note says there are 4316 in total (presumably
            the number of available entries -- TODO confirm).

    NOTE(review): the loop nesting was reconstructed from a paste that had
    lost its indentation -- confirm it matches the intended flow.
    """
    for page in range(first_page, first_page + page_count):
        url = mmurl + str(page)
        print(url)
        listing = fetch(url)
        print(SEPARATOR)
        # Per the original author, each listing page carries 10 models.
        for link in iter_between(listing, MODEL_HEAD, MODEL_TAIL):
            model_url = "https://" + link  # links are protocol-relative
            if not model_url.endswith("htm"):
                continue
            print(model_url)
            # Original author's note: profile pages now require a login, so
            # the image extraction below currently cannot work as intended.
            profile = fetch(model_url)
            print(profile)
            print(profile.find(IMAGE_HEAD))  # offset of the first image, -1 if none
            for image in iter_between(profile, IMAGE_HEAD, IMAGE_TAIL, keep=4):
                print("https://" + image)  # URL of a single picture
        print(SEPARATOR)
    print("Finished!")


if __name__ == "__main__":
    crawl()