#!/usr/bin/env python
#coding:utf8
import re
import sys
from subprocess import *
import os
import time
import cPickle as pickle
import string
import random
def get_http_data(url, file=None):
    '''Fetch a page with curl, or download it to disk with wget.

    url  -- address to retrieve.
    file -- when None (default) the body is fetched with curl and returned;
            otherwise the resource is saved to this path with wget.

    Returns, in fetch mode, the body with CR/LF removed and decoded as GBK
    (falling back to GB18030). If both decodings fail the error is printed
    and the undecoded bytes are returned; an empty value means nothing was
    received. In download mode the wget exit status is returned (0 on
    success).
    '''
    if file is None:
        # An argument list (no shell) keeps quotes, '&', ';' etc. in a
        # scraped URL from being interpreted as shell syntax.
        try:
            proc = Popen(["curl", "-s", url, "--max-time", "10"],
                         stdout=PIPE, stderr=PIPE)
        except OSError as err:
            # curl could not be started; report it like an empty response.
            print("%s %s" % (err, url))
            return ""
        data = proc.communicate()[0]
        if data:
            # Strip line breaks on the raw bytes so the page-level regexes
            # used by callers can match across the original lines.
            data = data.replace(b"\n", b"").replace(b"\r", b"")
            last_err = None
            for codec in ("gbk", "gb18030"):
                try:
                    return data.decode(codec)
                except UnicodeDecodeError as err:
                    last_err = err
            print("%s %s" % (last_err, url))
        return data

    try:
        proc = Popen(["wget", "-q", "-t", "3", "-c", "-T", "15",
                      url, "-O", file])
    except OSError as err:
        print("%s %s" % (err, url))
        # Same status a shell reports for "command not found".
        return 127
    return proc.wait()
def _save_gs_batch(records, prefix):
    '''Pickle records into a new, uniquely named file starting with prefix.'''
    # Binary mode keeps the pickle stream intact on every platform and the
    # context manager closes the file even if pickling raises.
    with open(get_fname(prefix), "wb") as handle:
        pickle.dump(records, handle)


def gs_list_data(url, type, key):
    '''Process one artist-list page.

    url  -- address of the list page.
    type -- 2 to crawl the albums of every artist on the page; any other
            value collects artist records (name, intro, image, category).
    key  -- category label stored on artist records and used in the names
            of the pickle files.

    Artist records accumulate in the module-level res_gs list and are
    written out in batches of 100. When the page contains no artist
    entries, the pending batch (res_zj for type 2, res_gs otherwise) is
    written out and 1 is returned so the caller can stop paging. In every
    other case None is returned.
    '''
    global res_gs, res_zj
    data = get_http_data(url)
    if not data:
        return None

    entries = re.findall(r'<div class="imgborder">(.*?)</div>', data)
    if not entries:
        # No entries on this page: flush whatever is still buffered.
        if type == 2:
            _save_gs_batch(res_zj, "%s/album_%s" % (album_path, key))
            res_zj = []
        else:
            _save_gs_batch(res_gs, "%s/artist_%s" % (artist_path, key))
            res_gs = []
        return 1

    # The attribute is the plain ASCII "onclick"; a look-alike non-ASCII
    # first letter would prevent the pattern from ever matching.
    anchor = re.compile(
        r'''<a href="javascript:\{\}" onclick="Smc.Cat.playsinger\('(\d+)'\);" title="(.*?)">''')
    for dt in entries:
        found = anchor.search(dt)
        if not found:
            continue
        singer_id, title = found.groups()
        print(title.encode("utf8"))
        if type == 2:
            # Album list for this artist.
            zjlink = ('http://music.sina.com.cn/yueku/singer_more_album.php?id='
                      + singer_id)
            zj_list_data(zjlink, title, key)
            continue

        # Artist profile page.
        infolink = 'http://music.sina.com.cn/yueku/s/' + singer_id + '.html'
        gsdata = {"name": title,
                  "intro": gs_info_data(infolink),
                  "img": "",
                  "type": key}
        imgs = re.search(r'<img src="(.*?)" (.*?)/>', dt)
        if imgs:
            gsdata["img"] = img_pro(title, imgs.group(1), 1)
        res_gs.append(gsdata)
        if len(res_gs) >= 100:
            _save_gs_batch(res_gs, "%s/artist_%s" % (artist_path, key))
            res_gs = []
    return None
def gs_info_data(url):
    '''Return the artist introduction text from an artist page.

    The page is fetched with get_http_data and the content of the
    "singer_intro" span is passed through strip_tags. None is returned
    when the page is empty or the span is not found.
    '''
    page = get_http_data(url)
    if not page:
        return None
    match = re.search(
        r'<span class="intro" id="singer_intro"(.*?)>(.*?)</span>', page)
    if match is None:
        return None
    # Group 2 is the span body; group 1 only holds extra attributes.
    return strip_tags(match.group(2))
def _save_zj_batch(records, prefix):
    '''Pickle records into a new, uniquely named file starting with prefix.'''
    # Binary mode keeps the pickle stream intact on every platform and the
    # context manager closes the file even if pickling raises.
    with open(get_fname(prefix), "wb") as handle:
        pickle.dump(records, handle)


def zj_list_data(url, name, key):
    '''Crawl the album-list pages of one artist.

    url  -- album-list address without the page parameter.
    name -- artist name stored on every album record.
    key  -- category label used in the names of the pickle files.

    Pages 1 to 9 are requested in turn. Album records (artist name, title,
    image file, intro, release date) accumulate in the module-level res_zj
    list and are written out in batches of 100. The first page without
    album entries flushes the pending batch and ends the crawl; a page that
    could not be fetched is skipped.
    '''
    global res_zj
    entry_re = re.compile(
        r'<a href="(.*?)" title="(.*?)"><img (.*?) src="(.*?)" (.*?)>')
    # Release date, written on the page as "<year>年<month>月<day>日 发行".
    date_re = re.compile(
        u'<div class="pub_time">(\\d+)年(\\d+)月(\\d+)日 发行</div>')
    prefix = "%s/album_%s" % (album_path, key)

    for p in range(1, 10):
        # Build each page address from the base URL; appending to `url`
        # itself would pile up "&page=1&page=2..." across iterations.
        data = get_http_data("%s&page=%s" % (url, p))
        if not data:
            continue
        entries = re.findall(r'<div class="img_pf">(.*?)</li>', data)
        if not entries:
            _save_zj_batch(res_zj, prefix)
            res_zj = []
            break
        for dt in entries:
            found = entry_re.search(dt)
            if not found:
                continue
            other = found.groups()
            link, title, img_src = other[0], other[1], other[3]
            print(title.encode("utf8"))
            zjdata = {"name": name, "title": title, "img": '',
                      "intro": "", "date": ""}
            zjdata["img"] = img_pro(name, img_src, 2, title)
            zjdata["intro"] = zj_info_data(link)
            sj = date_re.search(dt)
            if sj:
                zjdata["date"] = '%s-%s-%s' % sj.groups()
            res_zj.append(zjdata)
            if len(res_zj) >= 100:
                _save_zj_batch(res_zj, prefix)
                res_zj = []
def zj_info_data(url):
    '''Return the album introduction text from an album page.

    The page is fetched with get_http_data and the content of the
    "singer_intro" span is passed through strip_tags. None is returned
    when the page is empty or the span is not found.
    '''
    page = get_http_data(url)
    if not page:
        return None
    match = re.search(
        r'<span class="intro" id="singer_intro" (.*?)>(.*?)</span>', page)
    if match is None:
        return None
    # Group 2 is the span body; group 1 only holds extra attributes.
    return strip_tags(match.group(2))
def gs_url(type):
    '''Walk every artist-list page and hand it to gs_list_data.

    The singer categories are addressed as "<i>_<j>" with i and j each
    running from 1 to 3. For each category, pages 1 to 99 are requested
    until gs_list_data returns 1. `type` is passed through unchanged.
    '''
    for major in range(1, 4):
        for minor in range(1, 4):
            # Category label, also used in the names of the output files.
            file = "%s_%s" % (major, minor)
            page = 1
            while page < 100:
                url = 'http://music.sina.com.cn/category/singer.php?singer=%s_%s&p=%s' % (major, minor, page)
                print(url)
                if gs_list_data(url, type, file) == 1:
                    # gs_list_data reported a page without entries.
                    break
                page += 1
def strip_tags(str):
    '''Remove markup from an HTML fragment, keeping line breaks.

    Newlines, carriage returns and tabs are dropped. Every tag is removed
    except <br>, <br/> and <br /> (any letter case), which are normalised
    to "<br/>". A falsy argument (None or "") is returned unchanged.
    '''
    if not str:
        return str
    text = re.sub(r'[\n\r\t]', '', str)

    def _keep_breaks(match):
        # Decide per tag instead of going through a textual placeholder,
        # so ordinary text that happens to contain "BR" is left alone.
        if re.match(r'(?i)<br\s*/?>$', match.group(0)):
            return '<br/>'
        return ''

    return re.sub(r'<.*?>', _keep_breaks, text)
def mdir_yiwan(path):
    '''Create the 100 x 100 two-level directory tree 00/00/ .. 99/99/.

    `path` is used as a plain string prefix (the directory names are
    appended without inserting a separator). Directories that already
    exist are left alone. Nothing happens when `path` is empty.
    '''
    if not path:
        return
    names = ["%02d/" % n for n in range(100)]
    for outer in names:
        level1 = path + outer
        if not os.path.exists(level1):
            os.makedirs(level1)
        for inner in names:
            level2 = level1 + inner
            if not os.path.exists(level2):
                os.makedirs(level2)
def get_fname(path):
    '''Return "<path>_<suffix>" for a name that does not exist yet.

    The suffix consists of 6 distinct characters drawn at random from the
    ASCII digits and letters; a new suffix is drawn until the resulting
    name is not present on disk. The file itself is not created.
    '''
    # ascii_letters is fixed, whereas string.letters depends on the locale
    # (and is unavailable on Python 3).
    alphabet = string.digits + string.ascii_letters
    while True:
        fname = "%s_%s" % (path, "".join(random.sample(alphabet, 6)))
        if not os.path.exists(fname):
            return fname
def img_pro(name, src, type=1, title=None):
    '''Download an image and return the file name it was stored under.

    name  -- artist name, used only in the error log.
    src   -- image address.
    type  -- 1 stores under the artists image directory, any other value
             under the albums image directory.
    title -- album title, used only in the error log.

    The file name is "<seconds><4 fractional digits>.gif" taken from the
    current time, and the file is placed in the sub-directory
    "<digits 1-2>/<digits 3-4>/" of those fractional digits.

    Returns the file name on success. Returns None when `name` or `src` is
    empty, or when the download fails; a failure is appended to the
    "error_log" file as a tab-separated line.
    '''
    if not (name and src):
        return None

    # Fixed-width formatting always yields enough fractional digits; repr()
    # drops trailing zeros, which would leave the shard directories short
    # or empty.
    seconds, fraction = ("%.6f" % time.time()).split(".")
    fn = "%s%s.gif" % (seconds, fraction[:4])
    if type == 1:
        path = "/www/data/music/images/artists/"
    else:
        path = "/www/data/music/images/albums/"
    path += "%s/%s/" % (fraction[:2], fraction[2:4])
    path += fn

    stat = get_http_data(src, path)
    # Any non-zero status is a failure, including the negative codes
    # reported when the downloader is killed by a signal.
    if stat != 0:
        errors = "%s\t%s\t%s\t%s\t%s\n" % (name, src, type, stat, title)
        if not isinstance(errors, bytes):
            errors = errors.encode("utf8")
        with open('error_log', 'ab') as logfile:
            logfile.write(errors)
        return None
    return fn
# Script entry: shared state, output locations and command-line dispatch.
album_path = "/www/scripts/stat/mp3/test/zsc/album"
artist_path = "/www/scripts/stat/mp3/test/zsc/artist"
# Buffers of artist / album records waiting to be pickled.
res_gs = []
res_zj = []
# Batch counters named in the functions' global statements; the lines that
# incremented them are commented out in the visible code.
gi = zi = 0
if len(sys.argv) != 2:
    # Exactly one argument is expected: the crawl type.
    print("--help\n%s type" % (sys.argv[0]))
    sys.exit()
attr = int(sys.argv[1])
gs_url(attr)