010、python爬取经济学人最新列表文章,归档为本地文件
首先回顾一下获取首页最新文章列表的函数,它返回形如 [[链接, 标题], …] 的列表:
def getPaperList():
    """Fetch the Economist home page and list the articles found on it.

    Sends a GET request to ``https://economist.com`` using the module-level
    ``headers`` mapping (defined elsewhere in this file), parses the returned
    HTML with ``etree`` and walks the ``<li>`` nodes selected by a fixed
    absolute XPath.

    Returns:
        A list of ``[href, title]`` pairs, one per list item that has both
        an ``<a href>`` and an ``<h3>`` text node. ``href`` is the attribute
        value exactly as it appears in the page. The list is empty when the
        XPath matches nothing.

    Raises:
        Whatever ``urllib.request.urlopen`` raises on network/HTTP failure,
        and ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    url = 'https://economist.com'
    req = urllib.request.Request(url=url, headers=headers, method='GET')
    # Use the response as a context manager so the connection is always
    # closed, even if reading the body fails.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    selector = etree.HTML(html.decode('utf-8'))
    # NOTE(review): absolute, position-based path - presumably breaks whenever
    # the site layout changes; confirm against the live page.
    goodpath = '/html/body/div[1]/div[1]/div[1]/div[2]/div[1]/main[1]/div[1]/div[1]/div[1]/div[3]/ul[1]/li'
    awithtext = []
    for li in selector.xpath(goodpath):
        ap = li.xpath('article[1]/a[1]/div[1]/h3[1]/text()')
        a = li.xpath('article[1]/a[1]/@href')
        # Skip only the incomplete item instead of aborting the whole loop,
        # so one malformed <li> no longer drops every article after it.
        if not a or not ap:
            print('list item without link or title skipped', 'getMain')
            continue
        awithtext.append([a[0], ap[0]])
    # Plain return: a `return` inside `finally` would silently discard any
    # in-flight exception, including KeyboardInterrupt.
    return awithtext
1、接着分析要爬取的文章的 HTML 结构