Python 多进程爬取简书最新评论并存储到 MySQL

 

 

import requests
from lxml import etree
import pymysql
from multiprocessing import Pool

# Module-level MySQL connection and cursor used by get_jianshu_info() for
# every INSERT.
# `password=` / `database=` are the canonical PyMySQL keyword names; the
# `passwd=` / `db=` spellings are deprecated compatibility aliases.
# NOTE(review): the credentials are hard-coded here; consider loading them
# from configuration or the environment before sharing this file.
# NOTE(review): this connection is opened at import time and the script also
# runs a multiprocessing Pool; with the "fork" start method the workers would
# inherit this same connection object — confirm that is acceptable for the
# target platform.
conn = pymysql.connect(
    host='localhost',
    user='root',
    password='cjlushenbin',
    database='my_database',
    port=3306,
    charset='utf8',
)
cursor = conn.cursor()

# Browser-like User-Agent sent with every HTTP request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
}

def _parse_note(info):
    """Extract one article's fields from a note-list ``<li>`` element.

    Returns a dict with the keys ``author``, ``title``, ``content``,
    ``comment``, ``likes`` and ``reward`` (all ``str``).

    Raises:
        IndexError: if a required text node is missing from the element.
    """
    author = info.xpath('div/div/a[1]/text()')[0]
    title = info.xpath('div[1]/a/text()')[0]
    content = info.xpath('div[1]/p/text()')[0].strip()
    # Index 1: the second text node of the second link is the one used as
    # the comment count.
    comment = info.xpath('div/div/a[2]/text()')[1].strip()

    # When the first <span> is the "paid-meta" badge, the like and reward
    # spans are shifted one position to the right.
    is_paid = info.xpath('div/div/span[1]') == info.xpath('div/div/span[@class="paid-meta"]')
    like_pos = 2 if is_paid else 1
    like = info.xpath(f'div/div/span[{like_pos}]/text()')[0].strip()

    # The reward span is optional in BOTH layouts; fall back to the
    # placeholder instead of raising (previously only the non-paid layout
    # tolerated a missing reward, so such paid articles were dropped).
    reward_texts = info.xpath(f'div/div/span[{like_pos + 1}]/text()')
    reward = reward_texts[0].strip() if reward_texts else '无'

    return {
        'author': str(author),
        'title': str(title),
        'content': str(content),
        'comment': str(comment),
        'likes': str(like),
        'reward': str(reward),
    }


def _save_note(data):
    """Insert one parsed article into MySQL, rolling back if the insert fails."""
    # NOTE(review): the target table is named 'douban' although the rows come
    # from Jianshu; kept unchanged — confirm against the actual schema.
    table_name = 'douban'
    keys = ','.join(data)
    placeholders = ','.join(['%s'] * len(data))
    # Only the fixed table/column names are formatted into the statement;
    # the scraped values are passed as query parameters.
    sql = f'INSERT INTO {table_name}({keys}) VALUES({placeholders})'
    try:
        cursor.execute(sql, tuple(data.values()))
        conn.commit()
    except Exception as e:
        print(f'INSERT INTO MySQL table failed.Case:{e}')
        conn.rollback()


def get_jianshu_info(url, timeout=10):
    """Fetch one Jianshu list page and store each listed article in MySQL.

    Args:
        url: Address of the list page to download.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot hang the worker indefinitely.

    Returns:
        None. Download failures and per-article parse/insert failures are
        reported with ``print`` and skipped rather than raised.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Surface HTTP errors explicitly instead of parsing an error page
        # that would silently yield no articles.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f'Request failed for {url}.Case:{e}')
        return

    selector = etree.HTML(response.text)
    for info in selector.xpath('//ul[@class="note-list"]/li'):
        # Per-item boundary: one malformed article must not abort the page.
        try:
            _save_note(_parse_note(info))
        except Exception as e:
            print(f'ERROR.Case:{e}')


if __name__ == '__main__':
    # Pages 1-4 of the collection, requested with order_by=commented_at.
    urls = [f'http://www.jianshu.com/c/bDHhpK?order_by=commented_at&page={i}' for i in range(1, 5)]
    try:
        # map() blocks until every page has been processed; the context
        # manager then shuts the worker processes down instead of leaking them.
        with Pool(processes=4) as pool:
            pool.map(get_jianshu_info, urls)
    finally:
        # Close the parent's connection even if a worker raised.
        conn.close()

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值