import os
import time
from urllib.parse import urlsplit

import requests
from lxml import html
# Page whose headline links are crawled; read by GetHtml.
url="https://www.163.com"
# XPath scratch notes kept for reference.
# Article paragraphs on a news page (downEssay queries this path with /text() appended):
# //div[@class="post_body"]/div[@class="post_text"]/p
# The next two expressions are not used by any code visible in this file:
# //div[contains(@class,"threadlist_title pull_left")]/a/@href
# //div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div/a/@href
# Headline anchors on the front page (the expression getLINK evaluates):
# //div[@class="tab_main clearfix"]/div[@ne-role="tab-body"]//li/a
def GetHtml():
    """Fetch the page at the module-level ``url`` and pass its HTML to ``getLINK``.

    Waits three seconds, requests the page with a browser-like User-Agent,
    decodes the body as GBK and hands the decoded text on for link extraction.

    Raises:
        requests.RequestException: if the request times out, fails at the
            network level, or the server answers with an HTTP error status.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    # Fixed pause before the request is sent.
    time.sleep(3)
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly instead of parsing an error page that contains no links.
    response.raise_for_status()
    # The body is always decoded as GBK, whatever the response headers say.
    response.encoding = "gbk"
    getLINK(response.text)
# Module-level accumulator of {'text': ..., 'link': ...} entries; getLINK
# appends to it and then passes the whole list to GETtxt.
list_links=[]
def getLINK(texts):
    """Extract headline anchors from page HTML and pass them to ``GETtxt``.

    Every ``<a>`` matched under the tab body is recorded as
    ``{'text': <first text node or ''>, 'link': <stripped href or ''>}`` and
    appended to the module-level ``list_links``. The accumulated list is then
    handed to ``GETtxt``.

    Args:
        texts: Decoded HTML source of the page.
    """
    # Drop the comment delimiters so markup wrapped in <!-- ... --> is parsed
    # as ordinary elements.
    content = texts.replace('<!--', '').replace('-->', '')
    tree = html.etree.HTML(content)
    # NOTE(review): etree.HTML may yield None for input with no parseable
    # markup; treat that as "no links" rather than crashing on .xpath().
    if tree is None:
        anchors = []
    else:
        anchors = tree.xpath(
            '//div[@class="tab_main clearfix"]/div[@ne-role="tab-body"]//li/a'
        )
    for anchor in anchors:
        # Evaluate each XPath once and reuse the result.
        text_nodes = anchor.xpath('./text()')
        hrefs = anchor.xpath('./@href')
        list_links.append({
            'text': text_nodes[0] if text_nodes else '',
            'link': hrefs[0].strip() if hrefs else '',
        })
    GETtxt(list_links)
def GETtxt(lists):
    """Append the headline index to ``txts/newsTXT.txt`` and download each article.

    One tab-separated "title, link" line is written per entry, and
    ``downEssay`` is called for every entry with a 1-based running number.

    Args:
        lists: Iterable of dicts with ``'text'`` (headline) and ``'link'``
            (article URL) keys.
    """
    index_path = 'txts/newsTXT.txt'
    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs('txts', exist_ok=True)
    # The file is opened in append mode, so only emit the column header when
    # the file is new or empty; otherwise every run would repeat it.
    need_header = (
        not os.path.exists(index_path) or os.path.getsize(index_path) == 0
    )
    # errors='replace' keeps one character GBK cannot encode (for example
    # U+FFFD left behind by a lossy decode) from aborting the whole crawl.
    with open(index_path, 'a', encoding='gbk', errors='replace') as f:
        if need_header:
            f.write('新闻标题\t\t\t\t\t新闻链接\n')
        for i, item in enumerate(lists, start=1):
            f.write('%s\t\t%s\n' % (item['text'], item['link']))
            downEssay(item['text'], item['link'], i)
def downEssay(name, link, i):
    """Download one article and append its text to ``essays/<i>.txt``.

    The headline is written first, followed by one line per text node found
    in the article's paragraphs. Links that are empty or not HTTP(S), and
    requests that fail, are reported on stdout and skipped so that a single
    bad entry does not abort the whole crawl.

    Args:
        name: Headline of the article; written as the first line.
        link: URL of the article page.
        i: Number used to build the output file name.
    """
    # Entries without an href arrive here as ''; requests would raise on
    # them (and on relative links) before any download happens.
    if not link.lower().startswith(('http://', 'https://')):
        print('跳过无效链接:%s' % (name))
        return
    print('正在下载:%s' % (name))
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Without a timeout a stalled connection would block the crawl.
        response = requests.get(link, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('下载失败:%s (%s)' % (name, exc))
        return
    tree = html.etree.HTML(response.text)
    # NOTE(review): etree.HTML may yield None for an empty body; treat that
    # as an article with no paragraphs.
    if tree is None:
        paragraphs = []
    else:
        paragraphs = tree.xpath(
            '//div[@class="post_body"]/div[@class="post_text"]/p/text()'
        )
    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs('essays', exist_ok=True)
    # NOTE(review): append mode means a re-run adds to an existing
    # essays/<i>.txt instead of replacing it — confirm that is intended.
    with open('essays/' + str(i) + '.txt', 'a', encoding="utf8") as wy_File:
        wy_File.write(name + '\n')
        for paragraph in paragraphs:
            wy_File.write(paragraph.strip() + '\n')
def download_img(link):
    """Download the image at ``link`` into the ``pics`` directory.

    The file is named after the last segment of the URL *path*, so a query
    string or fragment never ends up in the file name.

    Args:
        link: URL of the image.

    Raises:
        ValueError: if the URL path has no final segment to use as a name
            (for example when it ends in ``/``).
        requests.RequestException: if the request times out, fails at the
            network level, or the server answers with an HTTP error status.
    """
    file_name = urlsplit(link).path.rsplit('/', 1)[-1]
    # Checked before any network or disk access: an empty name would make
    # open() target the directory itself.
    if not file_name:
        raise ValueError('no file name in image URL: %r' % (link,))
    headers = {'User-Agent': 'Mozilla/5.0'}
    # Without a timeout a stalled connection would block forever.
    response = requests.get(link, headers=headers, timeout=10)
    # Do not save an HTML error page under an image file name.
    response.raise_for_status()
    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs('pics', exist_ok=True)
    with open('pics/' + file_name, 'wb') as f:
        f.write(response.content)
if __name__ == '__main__':
    # Start the crawl only when the file is executed as a script.
    GetHtml()
# 网易首页新闻标题及二级页面新闻内容爬取(工具:Python + XPath)
# 最新推荐文章于 2021-12-14 21:50:10 发布