动机:
因为准备去澳洲留学,需要时刻关注澳洲的新闻动向;每次打开网站逐条浏览太慢,直接把新闻爬取下来保存到本地会方便得多。
代码:
采用了多线程:每个新闻栏目由一个线程负责抓取,下载数据时会快一些。
import os
import requests
from lxml import etree
import threading
def request_page(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    The request is sent with the module-level ``headers`` mapping, which must
    already be defined when this function is called.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # requests waits indefinitely unless a timeout is given; bounding it keeps
    # one stalled server from hanging the calling thread forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
def txt_to_html(txt):
    """Parse HTML markup with ``etree.HTML`` and return the parsed result.

    Args:
        txt: HTML source text.

    Returns:
        Whatever ``etree.HTML(txt)`` produces for the given markup.
    """
    return etree.HTML(txt)
def html_to_hrefs(html, filter_condition=''):
    """Evaluate an XPath expression that selects links on a parsed page.

    Args:
        html: Parsed document exposing an ``xpath`` method.
        filter_condition: XPath expression to evaluate.

    Returns:
        The result of ``html.xpath(filter_condition)``.
    """
    return html.xpath(filter_condition)
def pages_url_construction(hrefs, base_url='https://www.xkb.com.au'):
    """Turn the hrefs scraped from a listing page into absolute URLs.

    Args:
        hrefs: Iterable of href strings, normally site-relative paths such as
            ``/index.php/news/...``.
        base_url: Site root that relative hrefs are resolved against.

    Returns:
        A list with one absolute URL per input href, in the same order.
    """
    from urllib.parse import urljoin

    # urljoin (rather than plain concatenation) leaves already-absolute hrefs
    # untouched and inserts the missing "/" for hrefs without a leading slash.
    return [urljoin(base_url, href) for href in hrefs]
def html_to_pagetexts(html, filter_condition=''):
    """Evaluate an XPath expression that selects an article's text nodes.

    Args:
        html: Parsed document exposing an ``xpath`` method.
        filter_condition: XPath expression to evaluate.

    Returns:
        The result of ``html.xpath(filter_condition)``.
    """
    return html.xpath(filter_condition)
def html_to_page_title(html, filter_condition=''):
    """Evaluate an XPath expression that selects an article's title.

    Args:
        html: Parsed document exposing an ``xpath`` method.
        filter_condition: XPath expression to evaluate.

    Returns:
        The result of ``html.xpath(filter_condition)``; callers index into it
        to obtain the title string.
    """
    return html.xpath(filter_condition)
def _sanitize_title(title):
    """Return ``title`` with characters that are unsafe in a file name removed."""
    # Path separators and the characters Windows rejects in file names, plus
    # the comma that the original filter already removed.
    forbidden = '\\/:*?"<>|,'
    return title.translate(str.maketrans('', '', forbidden)).strip()


def request_every_page_data(pages_url):
    """Download every article and map its cleaned title to its body text.

    Args:
        pages_url: Iterable of absolute article URLs.

    Returns:
        A dict whose keys are article titles made safe for use as file names
        and whose values are the article paragraphs joined one per line (each
        paragraph followed by a newline). Pages on which no usable title is
        found are reported and skipped.
    """
    diction = {}
    for page_url in pages_url:
        html = txt_to_html(request_page(page_url))
        titles = html_to_page_title(
            html,
            filter_condition='//div[@class="article-cont-head"]//h1/text()',
        )
        page_title = _sanitize_title(titles[0]) if titles else ''
        if not page_title:
            # Not an article page (or the layout changed): skip this one page
            # instead of aborting the whole batch with an IndexError.
            print('skipped (no title found):', page_url)
            continue
        # Each list item is one sentence/paragraph of the article.
        page_texts = html_to_pagetexts(
            html,
            filter_condition='//div[@class="article-cont-val"]//p/text()',
        )
        diction[page_title] = ''.join(sentence + '\n' for sentence in page_texts)
    return diction
def write_texts_to_local(diction, save_dir):
    """Write each article to ``<save_dir>/<title>.txt`` encoded as UTF-8.

    Every file starts with the title, then a blank line, then the article
    text.

    Args:
        diction: Mapping of article title to article text.
        save_dir: Directory to write into; it is created if it is missing.
    """
    if save_dir:
        # open() does not create parent directories, so a first run against a
        # fresh output folder would otherwise fail with FileNotFoundError.
        os.makedirs(save_dir, exist_ok=True)
    for title, texts in diction.items():
        file = os.path.join(save_dir, title + '.txt')
        with open(file, 'w', encoding='utf-8') as f:
            f.write(title + '\n\n')
            f.write(texts)
def main(url):
    """Scrape one news listing page and save all of its articles locally.

    Args:
        url: Address of the listing page whose article links are followed.
    """
    listing = txt_to_html(request_page(url))
    link_xpath = '//div[@class="second-headline-cont2-div1-pteam"]//a/@href'
    article_urls = pages_url_construction(
        html_to_hrefs(listing, filter_condition=link_xpath)
    )
    articles = request_every_page_data(article_urls)
    write_texts_to_local(articles, save_dir='./澳洲新闻')
if __name__ == '__main__':
    # Request headers; request_page() reads this name as a module-level global.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
    }
    # Listing pages to scrape, one per news category.
    web_lst = [
        'https://www.xkb.com.au/index.php/news/caijing',
        'https://www.xkb.com.au/index.php/news/shizheng',
        'https://www.xkb.com.au/index.php/news/guojixinwen',
    ]
    print(web_lst[0])
    # One worker thread per listing page: build them all first, then start them.
    workers = [
        threading.Thread(target=main, args=(listing_url,))
        for listing_url in web_lst
    ]
    for worker in workers:
        worker.start()