# Scrape post content from Qiushibaike (qiushibaike.com)
import requests
from lxml import etree
class Qiushi:
    """Scraper for the Qiushibaike "8hr" listing pages.

    The page is downloaded with ``requests``, parsed with ``lxml`` and the
    interesting fields of every recommended article are extracted via XPath.
    """

    # XPath (relative to the page root) selecting one <li> per article.
    _ITEM_XPATH = '//div[@class="recommend-article"]/ul/li'

    # XPath prefix (relative to an article <li>) shared by the detail fields.
    _DETAIL = './div/div[@class="recmd-detail clearfix"]'

    # (record key, XPath relative to an article <li>), in the order in which
    # the values are printed.
    _FIELDS = (
        ("content", './div/a[@class="recmd-content"]/text()'),
        ("funny_count", _DETAIL + '/div/span[1]/text()'),
        ("comment_count", _DETAIL + '/div/span[4]/text()'),
        ("username", _DETAIL + '/a/span/text()'),
        ("avatar_url", _DETAIL + '/a/img/@src'),
    )

    def __init__(self):
        """Set up the URL template and the request headers."""
        # ``{}`` is replaced by the page number (see ``run``).
        self.url = 'http://www.qiushibaike.com/8hr/page/{}'
        # NOTE(review): this User-Agent looks truncated (the parenthesis is
        # never closed). It is kept unchanged because the markup served by the
        # site may depend on it -- confirm before "fixing" the string.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1 Trident/5.0;"
        }

    def parse_url(self, url):
        """Download *url* and return it as a parsed lxml HTML tree.

        Args:
            url: Address of the page to fetch.

        Returns:
            The result of ``etree.HTML`` applied to the response text.

        Raises:
            requests.HTTPError: If the final status code is not 200.
            requests.RequestException: On connection errors or timeouts
                (raised by ``requests.get``).
        """
        response = requests.get(url, timeout=10, headers=self.headers)
        # An explicit check instead of ``assert``: assertions are removed
        # when Python runs with -O, which would let error pages through.
        if response.status_code != 200:
            raise requests.HTTPError(
                "unexpected HTTP status {} for {}".format(
                    response.status_code, url
                ),
                response=response,
            )
        print(url)
        return etree.HTML(response.text)

    def parse_content(self, html):
        """Extract, print and return the fields of every article on a page.

        Args:
            html: Parsed page; any object whose ``xpath(expr)`` method
                returns a list of nodes exposing the same method.

        Returns:
            A list with one dict per article. Each dict maps ``content``,
            ``funny_count``, ``comment_count``, ``username`` and
            ``avatar_url`` to the raw list returned by the matching XPath
            query (an empty list when nothing matched).
        """
        items = html.xpath(self._ITEM_XPATH)
        print(items)
        records = []
        for node in items:
            record = {}
            for key, path in self._FIELDS:
                record[key] = node.xpath(path)
                print(record[key])
            records.append(record)
        return records

    def run(self, page=1):
        """Fetch one listing page and process its articles.

        Args:
            page: Page number substituted into the URL template
                (defaults to the first page).

        Returns:
            The list of records produced by ``parse_content``.
        """
        url = self.url.format(page)
        html = self.parse_url(url)
        return self.parse_content(html)
# Script entry point: scrape the first listing page when run directly.
if __name__ == '__main__':
    qiu = Qiushi()
    qiu.run()