A beginner's Python scraping exercise: using the urllib.request module to crawl crowdfunding project listings from zhongchou.com.

# python 3.6
import mysql.connector
import re
import urllib.request

conn = mysql.connector.connect(user='root', password='root', host='localhost', database='test')
cursor = conn.cursor()

def getHtml(url):
    # fetch the page and decode it as UTF-8
    req = urllib.request.urlopen(url).read()
    html = req.decode('utf-8')
    return html

def getData(html, leibie):
    # each project card sits between this div and the "筹款进度" label
    reg = re.compile('<div class="ssCardICBox siteCardICBox cgjs">(.*?)<p class="scP">筹款进度</p>', re.S)
    xiangmu = re.findall(reg, html)
    shengfen = ['河北','山西','辽宁','吉林','黑龙江','江苏','浙江','安徽','福建',
                '江西','山东','河南','湖北','湖南','广东','海南','四川','贵州','云南',
                '陕西','甘肃','青海','台湾','内蒙古','广西','西藏','宁夏','新疆','香港','澳门']
    zhixiashi = ['北京','天津','上海','重庆']
    for x in range(len(xiangmu)):
        # project name, then the raised amount / backer count / progress figures
        name = re.findall('class="siteCardICH3" title="(.*?)" target="_blank"', xiangmu[x])
        yichouzhichijindu = re.findall('<p class="ftP">(.*?)</p>', xiangmu[x])
        label = re.findall('site_ALink siteIlB_item" target="_blank">(.*?)</a>', xiangmu[x])
        # walk the location labels until a province or municipality shows up
        province = city = ''
        index = 0
        while index < len(label):
            if label[index] in shengfen:
                province = label[index]
                city = label[index + 1] if index + 1 < len(label) else ''
                break
            elif label[index] in zhixiashi:
                province = label[index]
                break
            else:
                index += 1
        # a parameterized query keeps quotes in the project name from breaking the SQL
        cursor.execute("""insert into test(项目名称,已筹款,支持数,筹款进度,省份or直辖市,市,类别)
            values(%s,%s,%s,%s,%s,%s,%s)""",
            (name[0], yichouzhichijindu[0][1:-1], yichouzhichijindu[1],
             yichouzhichijindu[2], province, city, leibie))
        conn.commit()

def endPage(html):
    # the last "normalPage" pagination link holds the final page number
    temp = re.findall('normalPage">(.*?)</a>', html)
    endpage = int(temp[-1])
    return endpage

def main():
    print('begin')
    types = {1:'生物科技', 2:'果蔬种植', 3:'生态养殖', 4:'茶酒饮品', 5:'休闲零食'}
    for i in [1, 2, 3, 4, 5]:
        # category list pages look like .../id-28-tid-4X-sm-pN
        urleibie = 'http://www.zhongchou.com/browse/id-28-tid-4' + str(i) + '-sm-p'
        leibie = types[i]
        ii = 1
        while 1:
            url = urleibie + str(ii)
            html = getHtml(url)
            getData(html, leibie)
            endpage = endPage(html)
            print('page %s has finished' % ii)
            ii += 1
            if ii > endpage:
                break
        print('type "%s" has finished' % leibie)
    cursor.close()
    conn.close()
    print('all finished')

if __name__ == '__main__':
    main()
This grew out of an undergraduate innovation project on crowdfunding completion rates, so I taught myself a bit of scraping; this version stores the results in a MySQL database.
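
The script assumes a table named test already exists in the test database, but the post never shows its schema. The following is only a minimal sketch with assumed column types (every scraped value is stored as a plain string):

# minimal, assumed schema for the target table; the column types are guesses,
# since the original post does not include a CREATE TABLE statement
import mysql.connector

conn = mysql.connector.connect(user='root', password='root', host='localhost', database='test')
cursor = conn.cursor()
cursor.execute("""create table if not exists test(
    `项目名称` varchar(255),
    `已筹款` varchar(64),
    `支持数` varchar(64),
    `筹款进度` varchar(64),
    `省份or直辖市` varchar(32),
    `市` varchar(32),
    `类别` varchar(32)
) default charset=utf8mb4""")
conn.commit()
cursor.close()
conn.close()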