import requests
from lxml import etree
# Fetch the Baidu News homepage and experiment with XPath queries on it.
url = 'http://news.baidu.com/'
headers = {
# Browser-like User-Agent so the site returns the regular desktop page.
"User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
}
# Download the page and decode the raw bytes to text (UTF-8 by default).
data = requests.get(url,headers=headers).content.decode()
# Build an lxml element tree that supports .xpath() queries.
xpath_data = etree.HTML(data)
"""
html_data = etree.HTML(data)
result_list = html_data.xpath('//div[contains(@id,"stickthread")]')
result_list = html_data.xpath('//head/following-sibling::*[1]')
print(len(result_list))
print(result_list)
"""
# NOTE: each assignment below overwrites the previous one; only the
# final query's result is actually printed.
result = xpath_data.xpath('/html/head/title//text()')  # page title text
result = xpath_data.xpath('//a/text()')  # text of every anchor
result = xpath_data.xpath('//a[@mon="ct=1&a=2&c=top&pn=18"]/text()')  # anchor text selected by attribute
result = xpath_data.xpath('//a[@mon="ct=1&a=2&c=top&pn=18"]/@href')  # href of that same anchor
result = xpath_data.xpath('//li/a/text()')  # anchor text inside list items
print(result)
# Demo: positional XPath predicates on a small in-memory document.
# (The duplicate `from lxml import etree` was removed — etree is already
# imported at the top of the file.)
html = """
<html>
<body>
<ul>
<li>1
<a href="">子</a>
</li>
<li>2
<a href="">子</a>
</li>
<li>3
<a href="">子</a>
</li>
<li>4
<a href="">子</a>
</li>
<li>5
<a href="">子</a>
</li>
</ul>
</body>
</html>
"""
x_data = etree.HTML(html)
# NOTE: each assignment overwrites the previous one; only the last
# query's result is printed.
result = x_data.xpath('//li[5]/text()')  # text nodes of the 5th <li>
result = x_data.xpath('/html/body/ul/li/a/text()')  # absolute path to every <a>'s text
# Empty: //a[2] selects an <a> that is the SECOND <a> child of its parent,
# and every <li> here contains only one <a>.
result = x_data.xpath('//a[2]')
print(result)
import requests
from lxml import etree
import json
class BctSpider(object):
    """Scrape thread titles and links from a ChainNode forum listing page."""

    def __init__(self):
        # Forum listing URL; the page number is appended in run().
        self.base_url = 'https://www.chainnode.com/forum/61-'
        self.headers = {
            # Desktop Firefox UA so the server serves the normal HTML page.
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
        }

    def get_response(self, url):
        """Download *url* and return the response body decoded as text."""
        # A timeout keeps the spider from hanging forever on a dead connection.
        response = requests.get(url, headers=self.headers, timeout=10)
        return response.content.decode()

    def parse_data(self, data):
        """Parse listing HTML and return [{'name': title, 'url': href}, ...]."""
        x_data = etree.HTML(data)
        title_list = x_data.xpath('//a[@class="link-dark-major font-bold bbt-block"]/span/text()')
        url_list = x_data.xpath('//a[@class="link-dark-major font-bold bbt-block"]/@href')
        # zip() pairs titles with hrefs and stops at the shorter list, so a
        # length mismatch can no longer raise IndexError like the old
        # enumerate-and-index loop did.
        return [{'name': title, 'url': href} for title, href in zip(title_list, url_list)]

    def save_data(self, data):
        """Serialize *data* to JSON and write it to news01.json (UTF-8)."""
        # ensure_ascii=False keeps Chinese titles human-readable instead of
        # \uXXXX escapes; the content is JSON, so the file is now named
        # 'news01.json' (was misleadingly 'news01.html').
        data_str = json.dumps(data, ensure_ascii=False)
        with open('news01.json', 'w', encoding='utf-8') as f:
            f.write(data_str)

    def run(self):
        """Fetch page 1 of the forum, parse it, and persist the results."""
        url = self.base_url + '1'
        data = self.get_response(url)
        parsed = self.parse_data(data)
        self.save_data(parsed)
# Guard the entry point so importing this module no longer fires a network
# request as a side effect; running the file directly behaves as before.
if __name__ == '__main__':
    BctSpider().run()