import os
from multiprocessing import Pool

import pymysql
import requests
from lxml import etree
# Database connection settings. Every value can be overridden through an
# environment variable; the fallbacks are the values this script has always
# used, so running it without any configuration behaves exactly as before.
# NOTE(review): the fallback password is still committed in plain text --
# provide JIANSHU_DB_PASSWORD in the environment and drop the default.
DB_CONFIG = {
    'host': os.environ.get('JIANSHU_DB_HOST', 'localhost'),
    'user': os.environ.get('JIANSHU_DB_USER', 'root'),
    # `password` / `database` are PyMySQL's primary keyword names; `passwd`
    # and `db` are only kept by PyMySQL as MySQLdb-compatibility aliases.
    'password': os.environ.get('JIANSHU_DB_PASSWORD', 'cjlushenbin'),
    'database': os.environ.get('JIANSHU_DB_NAME', 'my_database'),
    'port': int(os.environ.get('JIANSHU_DB_PORT', '3306')),
    'charset': 'utf8',
}

# Module-level connection and cursor used by the scraping function below.
conn = pymysql.connect(**DB_CONFIG)
cursor = conn.cursor()

# Request headers sent with every page fetch (desktop Chrome User-Agent).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'
}
def _parse_note(info):
    """Extract one article's fields from a ``note-list`` ``<li>`` element.

    Args:
        info: lxml element for a single ``<li>`` of the note list.

    Returns:
        A tuple ``(author, title, content, comment, likes, reward)`` of plain
        ``str`` values, in the column order used by the INSERT statement.

    Raises:
        IndexError: if the author, title, content, comment or like node is
            missing from the element.
    """
    author = info.xpath('div/div/a[1]/text()')[0]
    title = info.xpath('div[1]/a/text()')[0]
    content = info.xpath('div[1]/p/text()')[0].strip()
    # The second text node (index 1) of the comment link is the one stored.
    comment = info.xpath('div/div/a[2]/text()')[1].strip()

    first_span = info.xpath('div/div/span[1]')
    paid_spans = info.xpath('div/div/span[@class="paid-meta"]')
    # When the first span is the "paid-meta" marker, the like and reward
    # spans sit one position further along. Two empty result lists would
    # also compare equal, so additionally require that a marker was found.
    is_paid = bool(paid_spans) and first_span == paid_spans
    shift = 1 if is_paid else 0

    like = info.xpath(f'div/div/span[{1 + shift}]/text()')[0].strip()
    # The reward span is optional for paid and free articles alike; fall back
    # to the placeholder instead of dropping the whole article.
    rewards = info.xpath(f'div/div/span[{2 + shift}]/text()')
    reward = rewards[0].strip() if rewards else '无'

    # str() turns lxml's string results into plain str before they are
    # handed to the database driver.
    return (
        str(author),
        str(title),
        str(content),
        str(comment),
        str(like),
        str(reward),
    )


def get_jianshu_info(url, timeout=10):
    """Scrape one Jianshu listing page and insert every article into MySQL.

    Uses the module-level ``headers`` for the request and the module-level
    ``cursor`` / ``conn`` for the inserts. Each article is committed on its
    own; an article that fails to parse or insert is reported with ``print``
    and skipped, so the rest of the page is still processed.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the worker indefinitely.

    Returns:
        None.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Report HTTP errors instead of silently parsing an error page.
        response.raise_for_status()
    except requests.RequestException as e:
        # A failed page must not abort the other pages handled by the pool.
        print(f'Request for {url} failed.Case:{e}')
        return

    selector = etree.HTML(response.text)
    infos = selector.xpath('//ul[@class="note-list"]/li')

    # The statement is identical for every row, so build it once per page.
    table_name = 'douban'
    columns = ('author', 'title', 'content', 'comment', 'likes', 'reward')
    sql = 'INSERT INTO {table_name}({keys}) VALUES({values})'.format(
        table_name=table_name,
        keys=','.join(columns),
        values=','.join(['%s'] * len(columns)),
    )

    for info in infos:
        try:
            row = _parse_note(info)
            try:
                # Values are passed separately so the driver escapes them.
                cursor.execute(sql, row)
                conn.commit()
            except Exception as e:
                print(f'INSERT INTO MySQL table failed.Case:{e}')
                conn.rollback()
        except Exception as e:
            # Covers parse failures and a rollback that itself fails; the
            # loop then carries on with the next article.
            print(f'ERROR.Case:{e}')
if __name__ == '__main__':
    # Listing pages 1 through 4 of the collection, most recently commented first.
    urls = [
        f'http://www.jianshu.com/c/bDHhpK?order_by=commented_at&page={i}'
        for i in range(1, 5)
    ]
    # NOTE(review): get_jianshu_info writes through the module-level `conn`.
    # Under the "fork" start method the workers inherit the parent's
    # connection socket rather than opening their own -- confirm whether each
    # worker should create a separate connection.
    try:
        # The context manager shuts the worker processes down once map()
        # has returned, instead of leaving the pool open until exit.
        with Pool(processes=4) as pool:
            pool.map(get_jianshu_info, urls)
    finally:
        # Close the database connection even if a worker raised.
        conn.close()