#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import os
import urllib2
from urllib2 import HTTPError
def getHtml(url):
    try:
        page = urllib2.urlopen(url)
        html = page.read()
    except HTTPError:
        # crude fallback: on a failed request return a bare url string;
        # the regexes downstream simply find no matches in it
        return "http://www.moko.cc/channels/post/28/1.html"
    return html
def getJpg(url):
    try:
        page = urllib2.urlopen(url)
        html = page.read()
    except HTTPError:
        # crude fallback: if the post page cannot be fetched,
        # return a list holding one known image url
        temp = []
        temp.append("http://img2.moko.cc/users/3/965/289681/post/35/img2_src_8862177.jpg")
        return temp
    pattern = re.compile(r'<p class="picBox"><img src2="(.*)\.jpg"')
    m = re.findall(pattern, html)
    # re-append the ".jpg" suffix that the capture group stripped off
    for i in range(len(m)):
        m[i] = m[i] + ".jpg"
    print m
    return m
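# The pattern above targets markup of the shape <p class="picBox"><img src2="....jpg"
# (reconstructed from the regex itself, not copied from the live site); the real image
# url apparently sits in the src2 attribute, presumably because the site lazy-loads images.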
def getDict(html):
    # collect every '<a href="/post/...html">' link in the page
    pattern = re.compile(r'<a href="/post/(.*)\.html')
    m = re.findall(pattern, html)
    print "the length is: ", len(m)
    print m
    # collect the title of each post from its cover element
    pattern_titlename = re.compile(r'<div class="cover" cover-text="(.*)">')
    titlename = re.findall(pattern_titlename, html)
    print "the length of name list: ", len(titlename)
    print titlename
    if titlename:
        print titlename[0]
    # map each post url to its title; zip pairs them positionally and
    # guards against the two lists having different lengths
    url_name_dict = {}
    for url, title in zip(m, titlename):
        urltemp = "http://www.moko.cc/post/" + str(url) + ".html"
        url_name_dict[urltemp] = title
    print url_name_dict
    # print the url and the title name
    for key in url_name_dict.keys():
        print "url:", key, "title:", url_name_dict[key]
    return url_name_dict
# collect all jpg urls for every post and store them in a dict:
# key = post title, value = list of jpg urls belonging to that title
def getDict_title_jpgurl(dict_url_title):
    dict_title_jpgurl = {}
    for url in dict_url_title.keys():
        jpgs = getJpg(url)  # fetch the post page once and reuse the result
        print "the list of url is ", jpgs
        dict_title_jpgurl[dict_url_title[url]] = jpgs
    print "dict_title_jpgurl\n"
    print dict_title_jpgurl
    return dict_title_jpgurl
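# For reference, the returned dict has this shape (the values below are purely
# illustrative, not real scraped data):
#   {"some post title": ["http://img2.moko.cc/users/3/965/289681/post/35/img2_src_8862177.jpg"]}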
# save the images: create the parent directory, make one folder per post title,
# then download and write every jpg into it
def store_jpg(parentPath, dict_title_jpgurl):
    # strip leading/trailing whitespace from the path
    path = parentPath.strip()
    # create the parent directory if it does not exist yet
    if not os.path.exists(path):
        os.mkdir(path)
    for title in dict_title_jpgurl:
        childPath = path + "/" + str(title)  # Linux-style path; adjust the separator on Windows
        # skip titles whose folder already exists (already downloaded)
        if os.path.exists(childPath):
            continue
        print childPath
        try:
            os.mkdir(childPath)
        except OSError:
            # could not create the folder (e.g. invalid characters in the title): skip it
            continue
        jpgurl = dict_title_jpgurl[title]
        for url in jpgurl:
            jpgname = (url.split("/"))[-1]
            print "write the file", jpgname
            data = urllib2.urlopen(url).read()
            jpgPath = childPath + "/" + jpgname
            f = open(jpgPath, "wb")
            f.write(data)
            f.close()
def readConfigure():
    path = "/home/chujiangke/Python/config.txt"
    f = open(path, "r")
    for line in f.readlines():
        print line
        # drop the last path component of the configured url path,
        # keeping only the channel path
        line = line.strip()
        parts = line.split("/")
        del parts[-1]
        line = "/".join(parts)
        print line
        # walk pages 1 to 10 of this channel
        for i in range(1, 11):
            print i
            temp = "http://www.moko.cc" + line + "/" + str(i) + ".html"
            print temp
            html = getHtml(temp)
            dict_url_title = getDict(html)
            dict_title_jpgUrl = getDict_title_jpgurl(dict_url_title)
            store_jpg("/home/chujiangke/Python/Spider", dict_title_jpgUrl)
    f.close()
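# config.txt itself is not shown here; judging from the parsing above, each line is
# assumed to hold a channel page path such as the hypothetical
#   /channels/post/28/1.html
# readConfigure() drops the trailing page name and then walks pages 1 to 10 of that
# channel, mirroring what run() does for the hard-coded channel 28.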
def run():
    # crawl pages 1 to 10 of channel 28 directly
    for i in range(1, 11):
        print i
        html = getHtml("http://www.moko.cc/channels/post/28/" + str(i) + ".html")
        dict_url_title = getDict(html)
        dict_title_jpgUrl = getDict_title_jpgurl(dict_url_title)
        store_jpg("/home/chujiangke/Python/Spider", dict_title_jpgUrl)
#readConfigure()
run()
#getJpg("http://www.moko.cc/post/962832.html")
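# Porting note: this script is Python 2 (urllib2, print statements). On Python 3 the
# fetch would use urllib.request / urllib.error instead, roughly (untested sketch):
#   from urllib.request import urlopen
#   from urllib.error import HTTPError
#   html = urlopen(url).read().decode("utf-8", "ignore")
# and every print would need parentheses.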