A beginner's Python scraping exercise: using the urllib.request module to crawl crowdfunding project listings from zhongchou.com.

# python 3.6
import mysql.connector
import re
import urllib.request

conn = mysql.connector.connect(user='root', password='root', host='localhost', database='test')
cursor = conn.cursor()

def getHtml(url):
    # fetch the page and decode it as UTF-8
    req = urllib.request.urlopen(url).read()
    html = req.decode('utf-8')
    return html

def getData(html, leibie):
    # each project card sits between this div and the "筹款进度" label
    reg = re.compile('<div class="ssCardICBox siteCardICBox cgjs">(.*?)<p class="scP">筹款进度</p>', re.S)
    xiangmu = re.findall(reg, html)
    shengfen = ['河北','山西','辽宁','吉林','黑龙江','江苏','浙江','安徽','福建',
                '江西','山东','河南','湖北','湖南','广东','海南','四川','贵州','云南',
                '陕西','甘肃','青海','台湾','内蒙古','广西','西藏','宁夏','新疆','香港','澳门']
    zhixiashi = ['北京','天津','上海','重庆']
    for x in range(len(xiangmu)):
        # project name, then the raised amount / backer count / progress figures
        name = re.findall('class="siteCardICH3" title="(.*?)" target="_blank"', xiangmu[x])
        yichouzhichijindu = re.findall('<p class="ftP">(.*?)</p>', xiangmu[x])
        label = re.findall('site_ALink siteIlB_item" target="_blank">(.*?)</a>', xiangmu[x])
        # walk the location labels until a province or municipality shows up
        province = city = ''
        index = 0
        while index < len(label):
            if label[index] in shengfen:
                province = label[index]
                city = label[index + 1] if index + 1 < len(label) else ''
                break
            elif label[index] in zhixiashi:
                province = label[index]
                break
            else:
                index += 1
        # a parameterized query keeps quotes in the project name from breaking the SQL
        cursor.execute("""insert into test(项目名称,已筹款,支持数,筹款进度,省份or直辖市,市,类别)
            values(%s,%s,%s,%s,%s,%s,%s)""",
            (name[0], yichouzhichijindu[0][1:-1], yichouzhichijindu[1],
             yichouzhichijindu[2], province, city, leibie))
        conn.commit()

def endPage(html):
    # the last "normalPage" pagination link holds the final page number
    temp = re.findall('normalPage">(.*?)</a>', html)
    endpage = int(temp[-1])
    return endpage

def main():
    print('begin')
    types = {1:'生物科技', 2:'果蔬种植', 3:'生态养殖', 4:'茶酒饮品', 5:'休闲零食'}
    for i in [1, 2, 3, 4, 5]:
        # category list pages look like .../id-28-tid-4X-sm-pN
        urleibie = 'http://www.zhongchou.com/browse/id-28-tid-4' + str(i) + '-sm-p'
        leibie = types[i]
        ii = 1
        while 1:
            url = urleibie + str(ii)
            html = getHtml(url)
            getData(html, leibie)
            endpage = endPage(html)
            print('page %s has finished' % ii)
            ii += 1
            if ii > endpage:
                break
        print('type "%s" has finished' % leibie)
    cursor.close()
    conn.close()
    print('all finished')

if __name__ == '__main__':
    main()
This grew out of an undergraduate innovation project on crowdfunding completion rates, so I taught myself a bit of scraping; this version stores the results in a MySQL database.
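
The script assumes a table named test already exists in the test database, but the post never shows its schema. The following is only a minimal sketch with assumed column types (every scraped value is stored as a plain string):

# minimal, assumed schema for the target table; the column types are guesses,
# since the original post does not include a CREATE TABLE statement
import mysql.connector

conn = mysql.connector.connect(user='root', password='root', host='localhost', database='test')
cursor = conn.cursor()
cursor.execute("""create table if not exists test(
    `项目名称` varchar(255),
    `已筹款` varchar(64),
    `支持数` varchar(64),
    `筹款进度` varchar(64),
    `省份or直辖市` varchar(32),
    `市` varchar(32),
    `类别` varchar(32)
) default charset=utf8mb4""")
conn.commit()
cursor.close()
conn.close()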