1、导入第三方库
import requests
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
import random
2、获取网页内容
def fetchURL(url):
    """Fetch *url* and return its HTML text, or None if the request fails.

    Sends a desktop-browser User-Agent so the forum serves the normal page.
    Errors are printed (tutorial style) rather than raised.

    :param url: page URL to download
    :return: response body as text, or None on any request error
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    }
    try:
        # timeout prevents the crawler hanging forever on a stalled connection
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        return r.text
    except requests.HTTPError as e:
        print(e)
        print("HTTPError")
    except requests.RequestException as e:
        print(e)
    except Exception:
        # narrowed from a bare `except:` so Ctrl-C (KeyboardInterrupt) still works
        print("Unknown Error !")
    return None
3、解析网页内容
def parserHtml(html):
    """Parse a Discuz forum listing page into a DataFrame of threads.

    :param html: HTML text of one forum listing page
    :return: pandas DataFrame with columns 'tid', 'title', 'uid'
             (uid is None when the author link cannot be parsed)
    """
    soup = BeautifulSoup(html, "html.parser")
    tbodys = soup.find_all('tbody')
    data = []
    # the first <tbody> is the table header, so skip it
    for comment in tbodys[1:]:
        try:
            ci = comment.cite.a['href']
            uid = re.findall(r"(\d+)", ci)[0]
        except (AttributeError, TypeError, KeyError, IndexError):
            # row has no author link, or the href carries no numeric id
            uid = None
        tmp = comment.find(onclick="atarget(this)")
        if tmp is None:
            # not a thread row (e.g. separator/ad) — previously crashed on tmp.string
            continue
        title = tmp.string
        tid = re.findall(r"(\d+)", tmp['href'])[0]
        data.append({
            'tid': tid,
            'title': title,
            'uid': uid,
        })
    return pd.DataFrame(data)
4、存储爬取信息
def save_file(data_df):
    """Append *data_df* to the crawl result file as CSV.

    Appends (mode='a') without header or index so repeated calls across
    pages accumulate rows; utf_8_sig keeps Excel-friendly encoding.

    :param data_df: pandas DataFrame with columns tid, title, uid
    """
    data_df.to_csv('/root/discuz_result.txt', mode='a',
                   encoding='utf_8_sig', header=False, index=False)
    # fixed typo in the status message ("successed" -> "success")
    print("success")
5、主函数
if __name__ == '__main__':
    # Crawl listing pages 1..19 of the Discuz developer forum,
    # parse each page and append the rows to the result file.
    for page in range(1, 20):
        url = 'https://www.discuz.net/forum-developer-{}.html'.format(page)
        html = fetchURL(url)
        if html is None:
            # fetch failed — skip this page instead of crashing in the parser
            continue
        data = parserHtml(html)
        save_file(data)
        # random polite delay between requests to avoid hammering the server
        time.sleep(random.randint(1, 3))
    print("All successed ")
6、将数据存入数据库
-- Create a Hive database and table, then load the crawler's CSV output into it.
create database hive_db;
use hive_db;
-- Columns match the CSV written by save_file (tid,title,uid), comma-delimited.
create table crawl_discuz(tid int,title string,uid int)row format delimited fields terminated by ',';
-- Load the result file from the local filesystem into the table.
load data local inpath '/root/discuz_result.txt' into table crawl_discuz;