第一个python程序抓取

#!/usr/bin/env python
#coding:utf8
import re
import sys
from subprocess import *
import os
import time
import cPickle as pickle
import string
import random


def get_http_data(url,file=None):
    """Fetch *url* with curl, or download it to *file* with wget.

    With ``file`` left as None the page is fetched with ``curl -s`` (10
    second limit), CR/LF characters are removed and the body is decoded
    as GBK, falling back to GB18030.  If both decodes fail the error and
    the URL are printed and the undecoded body is returned.  An empty
    response returns None.

    With ``file`` given the URL is saved to that path with ``wget``
    (quiet, 3 tries, resume, 15 second timeout) and the process exit
    status is returned; 0 means success.

    The commands are passed as argument lists instead of shell strings,
    so characters such as ``&``, ``"``, ``$`` or spaces in the URL or
    path are never interpreted by a shell.
    """
    if file is None:
        try:
            proc = Popen(["curl", "-s", url, "--max-time", "10"],
                         stdout=PIPE, stderr=PIPE)
        except OSError as err:
            # curl could not be started; behave like an empty response.
            print("%s %s" % (err, url))
            return None

        data = proc.communicate()[0]
        if not data:
            return None

        # Flatten the page to one line so the callers' regexes can use ".".
        data = data.replace(b"\n", b"").replace(b"\r", b"")

        last_err = None
        for codec in ("gbk", "gb18030"):
            try:
                return data.decode(codec)
            except UnicodeError as err:
                last_err = err
        # Neither codec worked: report it and hand back the raw body.
        print("%s %s" % (last_err, url))
        return data

    try:
        proc = Popen(["wget", "-q", "-t", "3", "-c", "-T", "15", url, "-O", file],
                     stdout=None)
    except OSError:
        # Same status a shell reports for "command not found".
        return 127
    return proc.wait()

def gs_list_data(url,type,key):
    """Process one page of the singer list.

    url  -- address of the list page to fetch.
    type -- 2 walks each singer's album list through zj_list_data; any
            other value builds an artist record (name, intro, image file
            name, category key) and appends it to the global res_gs.
    key  -- category label used in the record and in the dump file name.

    Artist records are pickled to a new file under artist_path each time
    100 of them have accumulated.  When the page fetch yields nothing,
    the pending album records (type == 2) or artist records are pickled,
    the corresponding list is reset and 1 is returned so the caller can
    stop paging.  In every other case the function returns None.
    """
    global res_gs, res_zj

    data = get_http_data(url)
    if not data:
        # Nothing came back for this page: flush what is pending and stop.
        if type == 2:
            path = "%s/album_%s" % (album_path, key)
            with open(get_fname(path), "w") as out:
                pickle.dump(res_zj, out)
            res_zj = []
        else:
            path = "%s/artist_%s" % (artist_path, key)
            with open(get_fname(path), "w") as out:
                pickle.dump(res_gs, out)
            res_gs = []
        return 1

    for dt in re.findall(r'<div class="imgborder">(.*?)<\/div>', data):
        # The attribute name is plain ASCII "onclick"; the previous pattern
        # carried a Greek omicron in its place and therefore never matched.
        other = re.search(r'<a href="javascript:\{\}" onclick="Smc.Cat.playsinger\(\'(\d+)\'\);" title="(.*?)">', dt)
        if not other:
            continue

        singer_id, title = other.groups()
        print(title.encode("utf8"))

        if type == 2:
            # Album list page of this singer.
            zjlink = 'http://music.sina.com.cn/yueku/singer_more_album.php?id=' + singer_id
            zj_list_data(zjlink, title, key)
            continue

        gsdata = {"name": title, "intro": "", "img": "", "type": key}

        # Singer introduction page.
        infolink = 'http://music.sina.com.cn/yueku/s/' + singer_id + '.html'
        gsdata["intro"] = gs_info_data(infolink)

        imgs = re.search(r'<img src="(.*?)" (.*?)/>', dt)
        if imgs:
            gsdata["img"] = img_pro(title, imgs.group(1), 1)

        res_gs.append(gsdata)
        if len(res_gs) >= 100:
            # Dump in batches of 100 to keep the in-memory list small.
            path = "%s/artist_%s" % (artist_path, key)
            with open(get_fname(path), "w") as out:
                pickle.dump(res_gs, out)
            res_gs = []


def gs_info_data(url):
    """Return the singer introduction found on the page at *url*.

    The text inside the ``singer_intro`` span is passed through
    strip_tags.  None is returned when the page is empty or the span is
    not present.
    """
    page = get_http_data(url)
    if not page:
        return None

    found = re.search(r'<span class="intro" id="singer_intro"(.*?)>(.*?)</span>', page)
    if not found:
        return None

    # Group 2 is the span body; group 1 only holds extra attributes.
    return strip_tags(found.group(2))


def zj_list_data(url,name,key):
    """Process the album list pages of one singer.

    url  -- album list address without the page parameter.
    name -- singer name stored in every album record.
    key  -- category label used in the dump file name.

    Pages 1 to 9 are requested in turn.  Every album found becomes a
    record (singer name, title, image file name, intro, release date)
    appended to the global res_zj; the list is pickled to a new file
    under album_path each time it reaches 100 records.  When a page
    fetch yields nothing, the pending records are pickled and paging
    stops.
    """
    global res_zj

    # Release date markup, e.g. "2011年9月15日 发行"; compiled once per call.
    date_re = re.compile(u'<div class="pub_time">(\\d+)年(\\d+)月(\\d+)日 发行</div>')

    for p in range(1, 10):
        # Build each page address from the base URL; appending to ``url``
        # itself would pile up "&page=1&page=2..." across iterations.
        page_url = "%s&page=%s" % (url, p)
        data = get_http_data(page_url)
        if not data:
            path = "%s/album_%s" % (album_path, key)
            with open(get_fname(path), "w") as out:
                pickle.dump(res_zj, out)
            res_zj = []
            break

        for dt in re.findall(r'<div class="img_pf">(.*?)</li>', data):
            other = re.search('<a href="(.*?)" title="(.*?)"><img (.*?) src="(.*?)" (.*?)>', dt)
            if not other:
                continue

            link, title, _, img_src, _ = other.groups()
            print(title.encode("utf8"))

            zjdata = {"name": name, "title": title, "img": '', "intro": "", "date": ""}
            zjdata["img"] = img_pro(name, img_src, 2, title)
            zjdata["intro"] = zj_info_data(link)

            sj = date_re.search(dt)
            if sj:
                zjdata["date"] = '%s-%s-%s' % sj.groups()

            res_zj.append(zjdata)
            if len(res_zj) >= 100:
                # Dump in batches of 100 to keep the in-memory list small.
                path = "%s/album_%s" % (album_path, key)
                with open(get_fname(path), "w") as out:
                    pickle.dump(res_zj, out)
                res_zj = []


def zj_info_data(url):
    """Return the album introduction found on the page at *url*.

    The text inside the ``singer_intro`` span is passed through
    strip_tags.  None is returned when the page is empty or the span is
    not present.
    """
    page = get_http_data(url)
    if not page:
        return None

    found = re.search(r'<span class="intro" id="singer_intro" (.*?)>(.*?)</span>', page)
    if not found:
        return None

    # Group 2 is the span body; group 1 only holds extra attributes.
    return strip_tags(found.group(2))


def gs_url(type):
    """Walk the singer list pages and hand each one to gs_list_data.

    The site is addressed by two category indices (1-3 each) and a page
    number.  For every category pair, pages 1 to 99 are requested in
    order until gs_list_data returns 1, which ends that category early.

    type -- passed straight through to gs_list_data (2 selects album
            scraping, anything else artist scraping).
    """
    for i in range(1, 4):
        for j in range(1, 4):
            category = "%s_%s" % (i, j)
            for z in range(1, 100):
                url = 'http://music.sina.com.cn/category/singer.php?singer=%s_%s&p=%s' % (i, j, z)
                print(url)

                # 1 means the list page came back empty: move on to the
                # next category instead of requesting the remaining pages.
                if gs_list_data(url, type, category) == 1:
                    break


                           
def strip_tags(str):
    """Remove HTML tags from *str*, keeping line breaks as ``<br/>``.

    Newlines, carriage returns and tabs are dropped, ``<br>`` and
    ``<br />`` are normalised to ``<br/>`` and every other tag is
    removed.  Returns None for an empty or None input.

    Line breaks are protected with a NUL placeholder while the other
    tags are stripped.  A printable placeholder such as "BR" would also
    rewrite ordinary text containing those letters.  Any NUL already in
    the input is dropped first so it cannot be mistaken for the
    placeholder.
    """
    if not str:
        return None

    marker = "\x00"
    text = re.sub(r'[\n\r\t\x00]', '', str)
    text = re.sub(r'<br>|<br />', marker, text)
    text = re.sub(r'<.*?>', '', text)
    return text.replace(marker, '<br/>')
def mdir_yiwan(path, fanout=100):
    """Create a two-level tree of numbered directories under *path*.

    path   -- prefix the directory names are appended to; it is used
              verbatim, so it normally ends with "/".
    fanout -- number of entries per level.  The default of 100 gives
              00/00/ .. 99/99/, i.e. 10000 leaf directories.

    Every leaf is checked on its own, so a tree left incomplete by an
    interrupted run is filled in on the next call.  Nothing is done
    when *path* is empty or None.
    """
    if not path:
        return

    for i in range(fanout):
        for j in range(fanout):
            leaf = "%s%02d/%02d/" % (path, i, j)
            if not os.path.exists(leaf):
                # makedirs also creates the first-level directory if needed.
                os.makedirs(leaf)


def get_fname(path):
    """Return ``<path>_<suffix>`` for a name that does not exist yet.

    The suffix is 6 distinct characters drawn at random from the ASCII
    digits and letters; a new one is drawn until the resulting name is
    not present on disk.  ``string.ascii_letters`` is used so the
    alphabet does not depend on the process locale.

    NOTE: the name is only checked, not reserved, so another process
    could still create it before the caller opens it.
    """
    alphabet = string.digits + string.ascii_letters
    while True:
        candidate = "%s_%s" % (path, "".join(random.sample(alphabet, 6)))
        if not os.path.exists(candidate):
            return candidate

def img_pro(name,src,type=1,title=None):
    """Download the image at *src* and return the generated file name.

    name  -- singer name, used only in the error log.
    src   -- image URL.
    type  -- 1 stores under the artists image directory, any other value
             under the albums image directory.
    title -- album title, used only in the error log.

    The file name is the current time in seconds followed by four
    fractional digits and ".gif".  The image is stored in a two-level
    sub-directory made of those four digits (two per level).  The digits
    are computed arithmetically so there are always exactly four of
    them, however the float would print.

    Returns the file name on success.  Returns None when *name* or *src*
    is empty, or when the download exits with a non-zero status; in the
    latter case a tab-separated line is appended to ``error_log``.
    """
    if not (name and src):
        return None

    now = time.time()
    secs = int(now)
    frac = "%04d" % int((now - secs) * 10000)
    fn = "%d%s.gif" % (secs, frac)

    if type == 1:
        base = "/www/data/music/images/artists/"
    else:
        base = "/www/data/music/images/albums/"
    path = "%s%s/%s/%s" % (base, frac[:2], frac[2:], fn)

    stat = get_http_data(src, path)
    if stat != 0:
        # Any non-zero status is a failure, including negative values
        # reported when the downloader is killed by a signal.
        errors = "%s\t%s\t%s\t%s\t%s\n" % (name, src, type, stat, title)
        errors = errors.encode("utf8")
        with open('error_log', 'a') as logfile:
            logfile.write(errors)
        return None

    return fn


# Script entry: shared state, output locations and command-line handling.

# Records waiting to be pickled: artists in res_gs, albums in res_zj.
res_gs, res_zj = [], []
# Legacy counters; no live code in this file modifies them.
gi = zi = 0

# Dump files are created as "<dir>/album_<key>_<suffix>" and
# "<dir>/artist_<key>_<suffix>" below these prefixes.
album_path = "/www/scripts/stat/mp3/test/zsc/album"
artist_path = "/www/scripts/stat/mp3/test/zsc/artist"

# Exactly one argument is expected: the integer scrape type handed to gs_url.
if len(sys.argv) == 2:
    attr = int(sys.argv[1])
else:
    print("--help\n%s type" % (sys.argv[0]))
    sys.exit()

gs_url(attr)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值