The code is as follows:
import mysql.connector
from requests_html import HTMLSession
from multiprocessing import Pool

# Shared HTTP session; each pool worker process ends up with its own copy
session = HTMLSession()
def run_proc(url):
    print('Argument: %s' % url)
    # Collect every poem link inside the first .sons block of the page
    div = get(url).html.find('.main3', first=True).find('.left', first=True)
    a = div.find('.sons', first=True).find('a')
    result = []
    for j in range(len(a)):
        url_info = 'https://so.gushiwen.org' + a[j].attrs['href']
        print(url_info)
        val = parse_html(url_info, False, 'h1', True)
        # Flush to the database in batches of 200 rows
        if len(result) == 200:
            save_db(result)
            result = []
        # Guard against pages where nothing was parsed
        if len(val) > 0:
            result.append(val[0])
    # Save whatever is left over
    if len(result) > 0:
        save_db(result)
def run_proc_v2(url):
    print('Argument: %s' % url)
    # The first page also carries the total number of poems
    val = parse_html(url, True, 'b', False)
    result = val['result']
    total = int(val['total'])
    # 10 poems per page, so round up to get the page count
    page_no = (total + 9) // 10
    # Rebuild the URL with the page number substituted before '.aspx'
    url_split = url.split('.aspx')[0]
    prefix = url_split[:-1]
    for i in range(1, page_no):
        page_url = prefix + str(i) + '.aspx'
        # Flush to the database in batches of 200 rows
        if len(result) == 200:
            save_db(result)
            result = []
        r = parse_html(page_url, False, 'b', False)
        if len(r) > 0:
            result.extend(r)
    # Save whatever is left over
    if len(result) > 0:
        save_db(result)
def get(url):
    # Fetch a page and force UTF-8 so the Chinese text decodes correctly
    resp = session.get(url)
    resp.encoding = 'utf-8'
    return resp
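# Network errors are not handled above; a minimal retry wrapper could look
# like this (a sketch with a hypothetical helper name, not part of the
# original script):
def get_with_retry(url, retries=3):
    for _ in range(retries):
        try:
            return get(url)
        except Exception:
            pass
    raise RuntimeError('failed to fetch %s' % url)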
# Parse an HTML page
# url: the URL to parse
# is_total: whether to also parse the total poem count
# title_node: the HTML element that holds the poem title
# flag: whether to parse only the first entry
def parse_html(url, is_total, title_node, flag):
    div = get(url).html.find('.main3', first=True).find('.left', first=True)
    sons = div.find('.sons')
    result = []
    if sons is not None and len(sons) > 0:
        for i in range(len(sons)):
            son = sons[i]
            source = son.find('.source', first=True).find('a')
            values = [
                son.find(title_node, first=True).text,  # title
                source[1].text,                         # author
                son.find('.contson', first=True).text,  # content
                source[0].text,                         # dynasty
            ]
            result.append(values)
            if flag:
                break
    if is_total:
        # The second span of the pager holds the total, wrapped in one
        # character on each side; strip them to keep only the number
        total_str = div.find('.pages', first=True).find('span')[1].text
        total = total_str[1:-1]
        return {'result': result, 'total': total}
    else:
        return result
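# Return shape of parse_html: with is_total=True it returns a dict
# {'result': rows, 'total': '<count>'}; otherwise just the list of rows,
# where each row is [title, author, content, dynasty]. With flag=True the
# list holds at most one row.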
def save_db(result):
    if result is not None and len(result) > 0:
        print('Starting to insert data...')
        conn = mysql.connector.connect(user='root', password='wujinlei',
                                       host='127.0.0.1', port=3307,
                                       database='crawler')
        cursor = conn.cursor()
        for i in range(len(result)):
            cursor.execute('insert into poetry (title,author,content,dynasty) values (%s,%s,%s,%s)',
                           result[i])
        conn.commit()
        print('Inserted %s rows' % len(result))
        cursor.close()
        conn.close()
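# A batched alternative (sketch, same connection settings as save_db):
# cursor.executemany sends all rows in one call instead of one round trip
# per row.
def save_db_batch(result):
    if result:
        conn = mysql.connector.connect(user='root', password='wujinlei',
                                       host='127.0.0.1', port=3307,
                                       database='crawler')
        cursor = conn.cursor()
        cursor.executemany('insert into poetry (title,author,content,dynasty) values (%s,%s,%s,%s)',
                           result)
        conn.commit()
        print('Inserted %s rows' % len(result))
        cursor.close()
        conn.close()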
if __name__ == "__main__":
    p = Pool(8)
    # Grab every category link from the right-hand column of the index page
    cate = get("https://www.gushiwen.org/shiwen/").html.find('.main3 .right', first=True).find('a')
    print(len(cate))
    if cate is not None and len(cate) > 0:
        for i in range(len(cate)):
            url = cate[i].attrs['href']
            if url[0] == '/':
                # Relative links are paginated category lists
                url = 'https://www.gushiwen.org' + url
                p.apply_async(run_proc_v2, args=(url,))
            else:
                # Absolute links go to collection pages handled by run_proc
                p.apply_async(run_proc, args=(url,))
    p.close()
    p.join()
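The script assumes a poetry table already exists in the crawler database. The insert statement above implies four text columns; a minimal setup sketch (the column types here are assumptions, not taken from the original script):

import mysql.connector

conn = mysql.connector.connect(user='root', password='wujinlei',
                               host='127.0.0.1', port=3307, database='crawler')
cursor = conn.cursor()
# Column types are assumptions; adjust the lengths to your data
cursor.execute('create table if not exists poetry ('
               'id int primary key auto_increment,'
               'title varchar(255),'
               'author varchar(255),'
               'content text,'
               'dynasty varchar(64))')
cursor.close()
conn.close()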