现在大部分网站都上了反爬机制,直接用底层手段抓取很难得到准确结果
首先要安装 requests 和 lxml 两个库
pip方式:
pip install requests
pip install lxml
具体代码:
import requests
from lxml import etree
# Plain data-holder class for one row of the scraped ranking chart.
# (Fix: the pasted original had lost all indentation, so the method body
# was not syntactically inside the class.)
class xuanhuan:
    """A single entry of the fantasy-novel ranking chart.

    Attributes:
        ranking: position in the chart (as extracted from the page)
        name: title of the novel
        numOfWords: word count of the novel (as extracted from the page)
    """

    def __init__(self, ranking, name, numOfWords):
        self.ranking = ranking
        self.name = name
        self.numOfWords = numOfWords
# Fetch the fantasy-novel ranking page and print rank / title / word count
# for every row. A timeout is set so a stalled connection cannot hang the
# script forever.
html = requests.get('http://top.hengyan.com/xuanhuan', timeout=10)
selector = etree.HTML(html.text)
# Parse the HTML into a tree and select every <ul> row of the chart with
# XPath -- far simpler than matching the markup with regular expressions.
content = selector.xpath('/html/body/div[2]/div[2]/div[3]/ul')
# Row layout (from browser inspection):
#   .../ul[i]/li[1] = ranking, li[3] = title, li[4] = word count
for row in content:
    # Relative XPath on each already-selected row replaces the original's
    # range(1, len+1) loop that rebuilt absolute paths via string
    # concatenation and re-queried the whole document three times per row.
    ranking = row.xpath('li[1]/text()')
    name = row.xpath('li[3]/text()')
    numOfWords = row.xpath('li[4]/text()')
    # xpath() returns a *list* of text nodes; the original printed the raw
    # lists (e.g. ['1\n']). Take the first hit, stripped, or '' when a row
    # is malformed / a cell is empty.
    entry = xuanhuan(
        ranking[0].strip() if ranking else '',
        name[0].strip() if name else '',
        numOfWords[0].strip() if numOfWords else '',
    )
    print(entry.ranking, entry.name, entry.numOfWords)
运行结果: