- #!/usr/bin/python
- # -*- coding: utf-8 -*-
- import re
- import os
- import urllib2
def getHtml(url):
    """Fetch *url* and return the response body as a string.

    On HTTPError a hard-coded fallback URL string is returned instead of
    page content -- callers treat any returned string as HTML, so this
    odd contract is kept as-is for backward compatibility.
    """
    from urllib2 import HTTPError
    try:
        page = urllib2.urlopen(url)
        try:
            html = page.read()
        finally:
            page.close()  # don't leak the socket, even if read() fails
    except HTTPError:
        # NOTE(review): returning a URL string as "html" is dubious, but
        # downstream code expects *some* string back -- kept unchanged.
        return "http://www.moko.cc/channels/post/28/1.html"
    return html
def getJpg(url):
    """Fetch the page at *url* and return the list of jpg URLs found in
    its <p class="picBox"> image tags.

    On HTTPError a single hard-coded fallback image URL is returned so
    the caller always receives a non-empty list.
    """
    from urllib2 import HTTPError
    try:
        page = urllib2.urlopen(url)
        try:
            html = page.read()
        finally:
            page.close()  # free the socket even when read() raises
    except HTTPError:
        return ["http://img2.moko.cc/users/3/965/289681/post/35/img2_src_8862177.jpg"]
    # src2 holds the image path without its ".jpg" extension; re-append it.
    # Non-greedy (.*?) stops at the first ".jpg" -- the original greedy
    # (.*) could swallow past it when a line contains more than one
    # '.jpg"' substring.
    pattern = re.compile(r'<p class="picBox"><img src2="(.*?)\.jpg"')
    m = [u + ".jpg" for u in re.findall(pattern, html)]
    print(m)
    return m
def getDict(html):
    """Parse a channel page and map full post URLs to their cover titles.

    html: the channel page markup as a string.
    Returns {"http://www.moko.cc/post/<id>.html": title, ...}.

    Links and titles are paired positionally; if the two regexes yield
    different counts, the extra items are dropped (zip truncates).
    Prints use function form so the debug output works under both
    Python 2 and Python 3.
    """
    # Every post link looks like: <a href="/post/<id>.html
    pattern = re.compile(r'<a href="/post/(.*)\.html')
    m = re.findall(pattern, html)
    print("the length is:  " + str(len(m)))
    print(m)
    # The cover block carries the post title in its cover-text attribute.
    pattern_titlename = re.compile(r'<div class="cover" cover-text="(.*)">')
    titlename = re.findall(pattern_titlename, html)
    print("the length of name list:  " + str(len(titlename)))
    print(titlename)
    if titlename:  # guard: an empty page used to raise IndexError here
        print(titlename[0])
    # zip pairs each link with its title by position.  The original
    # titlename[m.index(url)] lookup broke on duplicate links (always
    # resolved to the first occurrence) and was O(n^2).
    url_name_dict = {}
    for url, name in zip(m, titlename):
        url_name_dict["http://www.moko.cc/post/" + str(url) + ".html"] = name
    print(url_name_dict)
    for key in url_name_dict.keys():
        print("url: " + key + " titlename " + url_name_dict[key])
    return url_name_dict
# Build {title: [jpg urls]} for every post: key is the post title,
# value is the list of all jpg urls found on that post's page.
def getDict_title_jpgurl(dict_url_title):
    """Fetch each post page and map its title to the jpg URLs found there.

    dict_url_title: {post url: title} as produced by getDict().
    Returns {title: [jpg url, ...]}.
    """
    dict_title_jpgurl = {}
    for url in dict_url_title.keys():
        # Fetch once and reuse -- the original called getJpg(url) twice,
        # downloading every page a second time just for the debug print.
        jpgs = getJpg(url)
        print("the list of url is  " + str(jpgs))
        dict_title_jpgurl[dict_url_title[url]] = jpgs
    print("dict_title_jpgurl\n")
    print(dict_title_jpgurl)
    return dict_title_jpgurl
# Save the images: create the parent directory, one sub-directory per
# title, then download every jpg into its title's directory.
def store_jpg(parentPath, dict_title_jpgurl):
    """Download every jpg in *dict_title_jpgurl* into per-title
    sub-directories under *parentPath*.

    parentPath: root directory for all downloads (created if missing).
    dict_title_jpgurl: {title: [jpg url, ...]} from getDict_title_jpgurl().

    A title whose directory already exists is skipped (assumed to be
    downloaded already); a title whose directory cannot be created
    (e.g. illegal characters in the name) is skipped as well.
    """
    path = parentPath.strip()  # drop accidental surrounding whitespace
    if not os.path.exists(path):
        os.mkdir(path)
    for title, jpgurls in dict_title_jpgurl.items():
        # POSIX-style join; adjust the separator for Windows if needed.
        childPath = path + "/" + str(title)
        if os.path.exists(childPath):
            continue  # already downloaded this post
        print(childPath)
        try:
            os.mkdir(childPath)
        except OSError:
            continue  # unmakeable directory name -- skip this post
        for url in jpgurls:
            jpgname = url.split("/")[-1]  # last path component is the file name
            print("write the file " + jpgname)
            data = urllib2.urlopen(url).read()
            # with-statement closes the handle even if write() fails
            # (the original used the deprecated file() and relied on
            # explicit close()).
            with open(childPath + "/" + jpgname, "wb") as f:
                f.write(data)
def readConfigure():
    """Read channel paths from the config file and crawl pages 1-10 of
    each channel, storing all images under the Spider directory.

    Each config line is expected to look like /channels/post/28/1.html;
    the trailing page component is stripped and replaced by 1..10.
    """
    path = "/home/chujiangke/Python/config.txt"
    # with-statement closes the config file (the original leaked the handle)
    with open(path, "r") as f:
        lines = f.readlines()
    for line in lines:
        print(line)
        # Drop surrounding whitespace, then the trailing "<n>.html" part.
        parts = line.strip().split("/")
        del parts[-1]
        base = "/".join(parts)
        print(base)
        for i in range(1, 11):
            print(i)
            temp = "http://www.moko.cc" + base + "/" + str(i) + ".html"
            print(temp)
            html = getHtml(temp)
            dict_url_title = getDict(html)
            dict_title_jpgUrl = getDict_title_jpgurl(dict_url_title)
            store_jpg("/home/chujiangke/Python/Spider", dict_title_jpgUrl)
- def run():
- for i in range(1,11,1):
- print i
- html=getHtml("http://www.moko.cc/channels/post/28/"+str(i)+".html")
- dict_url_title=getDict(html)
- dict_title_jpgUrl=getDict_title_jpgurl(dict_url_title)
- store_jpg("/home/chujiangke/Python/Spider",dict_title_jpgUrl);
# Entry point: guard so importing this module does not trigger a crawl.
if __name__ == "__main__":
    # readConfigure()  # alternative entry point: crawl channels listed in config.txt
    run()
    # getJpg("http://www.moko.cc/post/962832.html")
Python 爬虫把美空的图片都扒下来了
最新推荐文章于 2023-08-21 08:00:00 发布