20. Python XPath Introduction and a News Content Crawler
XPath Introduction
Use XPath to extract the content you are interested in.
A web page is semi-structured data: an HTML document is really a tree structure, and the root node is html.
Regular expressions can extract content too, but XPath is more convenient.
1. Path notation
/: step from the root (or from the current node) to a child
//: select matching nodes anywhere in the document, however deep
/text(): extract the text content of a node
/@attr: extract the value of the attribute attr
2. Predicates (filter conditions), illustrated in the sketch below
/div[@id]                 # div elements that have an id attribute
/div[@id="content_id"]    # div whose id equals "content_id"
/book[price>100]          # filter by the value of a child node
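A minimal sketch of these expressions with lxml; the document and the names in it (content_id, book, price) are invented for illustration. etree.fromstring uses the strict XML parser here; etree.HTML, used in the crawler below, is its lenient HTML counterpart and works the same way for xpath():

from lxml import etree

doc = etree.fromstring('''
<html><body>
  <div id="content_id"><a href="/a.html">first</a></div>
  <div>no id here</div>
  <book><name>t1</name><price>120</price></book>
  <book><name>t2</name><price>80</price></book>
</body></html>''')

print(doc.xpath('//div'))                              # every div in the document: two elements
print(doc.xpath('//div[@id]'))                         # only divs carrying an id attribute: one element
print(doc.xpath('//div[@id="content_id"]/a/text()'))   # ['first']   -- /text() pulls the text content
print(doc.xpath('//div[@id="content_id"]/a/@href'))    # ['/a.html'] -- /@attr pulls an attribute value
print(doc.xpath('//book[price>100]/name/text()'))      # ['t1']      -- filter by a child node's value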
Note: lxml must first be installed on Windows.
Prerequisite: pip is already installed.
C:\Users\lyd>pip install lxml
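If the install succeeded, importing the module and printing its version string is a quick sanity check:

C:\Users\lyd>python -c "from lxml import etree; print(etree.__version__)"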
Using XPath
# Fetch the news list
import requests
from lxml import etree
import datetime

# Given the index-page URL, yield (title, url, publish time) for each item in the page's news list
def getNewsUrlList(baseUrl):
    x = requests.get(baseUrl)
    html = x.content.decode('gbk')
    selector = etree.HTML(html)
    # li[div] keeps only the <li> items that contain a <div> (skips separator rows)
    contents = selector.xpath('//div[@id="content_right"]/div[@class="content_list"]/ul/li[div]')
    for eachLink in contents:
        url = eachLink.xpath('div/a/@href')[0]
        title = eachLink.xpath('div/a/text()')[0]
        ptime = eachLink.xpath('div[@class="dd_time"]/text()')[0]
        yield title, url, ptime
# Given the news URLs, fetch the body text of each article
def getNewsContent(newsUrlList):
    for title, url, ptime in newsUrlList:   # was newUrlList, a NameError: the parameter is newsUrlList
        x = requests.get(url)
        html = x.content.decode('gbk')      # the page declares gb2312; gbk is a superset, so it decodes safely
        selector = etree.HTML(html)
        constants = selector.xpath('//div[@class="left_zw"]/p/text()')
        news = '\r\n'.join(constants)       # line break after each <p>; \r\n is the Windows line ending (Linux uses \n)
        yield title, url, ptime, news
# Return the date i days before today, formatted as MMDD (e.g. '0719'); i=1 gives yesterday
def getYesterday(i):
    today = datetime.date.today()
    oneday = datetime.timedelta(days=i)
    yesterday = today - oneday
    return yesterday.strftime("%m%d")
if __name__ == "__main__":
    urlTemplate = 'http://www.chinanews.com/scroll-news/mil/{0}/{1}{2}/news.shtml'
    # http://www.chinanews.com/scroll-news/2017/0719/news.shtml
    # http://www.chinanews.com/scroll-news/mil/2017/0717/news.shtml
    testurl = urlTemplate.format('2017', '07', '20')   # month must be zero-padded to match the URL scheme
    # print(testurl)
    # newUrlList = getNewsUrlList(testurl)
    # for title, url, ptime in newUrlList:
    #     print(title, url, ptime)
    # newsConstant = getNewsContent(newUrlList)
    # f = open('news.txt', 'w', encoding='utf-8')   # open a file for writing
    # w = lambda x: f.write(x + '\r\n')
    # for title, url, ptime, news in newsConstant:
    #     w('~' * 100)
    #     w(title)
    #     w(url)
    #     w(news)
    # f.close()
    ######################### Homework: ####################################
    # Crawl the news from the past n days
    for i in range(0, 10):
        yesterday = getYesterday(i)
        urls = '%s%s%s' % ("http://www.chinanews.com/scroll-news/mil/2017/", yesterday, '/news.shtml')
        newUrlList = getNewsUrlList(urls)
        for title, url, ptime in newUrlList:
            print(title, url, ptime)
        # newsConstant = getNewsContent(newUrlList)
        # f = open('news.txt', 'a', encoding='utf-8')   # append: mode 'w' here would overwrite the file on every day's pass
        # w = lambda x: f.write(x + '\r\n')
        # for title, url, ptime, news in newsConstant:
        #     w('~' * 100)
        #     w(title)
        #     w(url)
        #     w(news)
        # f.close()
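Two pitfalls lurk in the commented-out writing code above: getNewsUrlList returns a generator, so the print loop exhausts it before getNewsContent ever sees an item; and re-opening news.txt inside the day loop risks overwriting earlier output. A minimal sketch of the homework that avoids both, reusing the functions defined above (Python 3 text mode, so no manual .encode('utf-8') is needed):

# Open the output file once, outside the loop, with an explicit encoding
with open('news.txt', 'w', encoding='utf-8') as f:
    for i in range(0, 10):
        day = getYesterday(i)
        urls = 'http://www.chinanews.com/scroll-news/mil/2017/%s/news.shtml' % day
        # feed the generator straight into getNewsContent instead of printing it first
        for title, url, ptime, news in getNewsContent(getNewsUrlList(urls)):
            print(title, url, ptime)
            f.write('~' * 100 + '\r\n')
            f.write(title + '\r\n')
            f.write(url + '\r\n')
            f.write(news + '\r\n')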
-----------------------------------------------------------------------------------------------------------------------