python爬取ppt_Python-爬蟲13-實作-3-爬取PTT網站(完整程式碼)

最新推荐文章于 2024-05-10 19:42:04 发布

weixin_39538451

最新推荐文章于 2024-05-10 19:42:04 发布

阅读量361

点赞数

文章标签： python爬取ppt

爬取PTT網站(完整程式碼)

import requests

import time

import json

from bs4 import BeautifulSoup

domain_url = 'https://www.ptt.cc'

def get_ppt_page(url, timeout=10):
    """Download one board page and return its HTML text.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled connection
            would hang the whole crawl.

    Returns:
        The response body as text, or ``None`` when the server answers with
        any status code other than 200.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires (not caught here).
    """
    resp = requests.get(
        url=url,
        cookies={'over18': '1'},  # cookie that records the "over 18" confirmation
        timeout=timeout,
    )
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text

def _parse_push_count(push_count):
    """Convert the text of an ``nrec`` cell into an integer push score.

    Plain numbers are returned as-is and an empty cell counts as 0.
    The non-numeric markers are mapped as follows (NOTE(review): based on
    the usual PTT display convention -- confirm against the live site):

    * ``'爆'``  -> 100  (more pushes than the two-digit counter can show)
    * ``'X1'``..``'X9'`` -> -10..-90  (net boos, in tens)
    * ``'XX'``  -> -100
    * anything else -> 0
    """
    push_count = push_count.strip()
    if not push_count:
        return 0
    try:
        return int(push_count)
    except ValueError:
        pass
    if push_count == '爆':
        return 100
    if push_count.startswith('X'):
        level = push_count[1:]
        if level.isdecimal():
            return -10 * int(level)
        return -100
    return 0


def get_pageinfo(resdata, today):
    """Extract today's articles and the previous-page link from a board page.

    Args:
        resdata: HTML text of a board index page.
        today: Date string compared against the stripped text of each
            entry's ``date`` cell; only entries that match are returned.

    Returns:
        A ``(pptdata, prev_url)`` tuple. ``pptdata`` is a list of dicts with
        the keys ``title``, ``href``, ``push_num`` and ``author``, one per
        matching entry that still has a link. ``prev_url`` is the ``href`` of
        the second anchor inside the paging button group.
    """
    soup = BeautifulSoup(resdata, 'html5lib')

    # The second <a> of the paging group holds the previous-page href.
    paging_div = soup.find('div', 'btn-group btn-group-paging')
    prev_url = paging_div.find_all('a')[1]['href']

    pptdata = []  # collected article records
    for d in soup.find_all('div', 'r-ent'):
        # Skip entries that were not published today.
        if d.find('div', 'date').text.strip() != today:
            continue

        # Push count: may be a number, empty, '爆' or 'X1', 'X2', ...
        push_count = d.find('div', 'nrec').text
        print(push_count)
        push_num = _parse_push_count(push_count)
        print("推文數", push_num)

        # An entry without a hyperlink means the article no longer exists.
        link = d.find('a')
        if not link:
            continue
        href = link['href']
        print("標題連結", href)
        title = link.text
        print("標題", title)

        author_div = d.find('div', 'author')
        author = author_div.text if author_div else ''  # author may be missing
        print("作者", author)

        pptdata.append({
            'title': title,
            'href': href,
            'push_num': push_num,
            'author': author
        })
    return pptdata, prev_url

if __name__ == '__main__':
    # Crawl the Gossiping board backwards from the newest index page,
    # collecting every article dated today, then report and save them.
    ppt_page = get_ppt_page(domain_url + '/bbs/Gossiping/index.html')
    if ppt_page:
        # Today's date with the leading '0' removed to match the site's format.
        today = time.strftime("%m/%d").lstrip('0')
        pptdata, prev_href = get_pageinfo(ppt_page, today)
        print(domain_url + prev_href)
        print(pptdata)

        articles = []
        # While the current page has articles from today, keep them and step
        # back one page to look for more.
        while pptdata:
            articles += pptdata
            prev_page = get_ppt_page(domain_url + prev_href)
            if not prev_page:
                # Fetch failed; stop instead of parsing a missing page.
                break
            # Update prev_href (not a separate name) so the next iteration
            # really moves to an older page instead of re-fetching this one.
            pptdata, prev_href = get_pageinfo(prev_page, today)

        # Report on the accumulated list; pptdata is empty once the loop ends.
        print('今天有', len(articles), '篇文章')

        hot = 3  # an article needs more pushes than this to count as hot
        print('熱門文章(> %d 推):' % (hot))
        for a in articles:
            if int(a['push_num']) > hot:
                print(a['title'])

        with open('ppt1.json', 'w', encoding='utf-8') as f:
            # indent width, sorted keys, and non-ASCII characters kept as-is
            json.dump(articles, f, indent=10, sort_keys=True, ensure_ascii=False)

YiruAtStudio - 電腦影音教學

weixin_39538451

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫