1.借助Postman生成Headers
如果Headers中含有if-modified-since，也需要将其去掉。
2.分析网页
3.代码
import requests
import re
from lxml import etree
class BoardListCrawler:
    """Fetch a section page of the newsmth.net nForum and extract its boards.

    ``get_content`` downloads the AJAX fragment for one section page and
    ``get_board_list`` parses that fragment into a list of board dicts.
    """

    # Headers sent with every request; they mimic a browser-issued
    # XMLHttpRequest (see 'User-Agent' and 'X-Requested-With').
    headers = {
        'Accept': "*/*",
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9",
        'Connection': "keep-alive",
        'Host': "www.newsmth.net",
        'Referer': "http://www.newsmth.net/nForum/",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36",
        'X-Requested-With': "XMLHttpRequest",
        'cache-control': "no-cache",
    }
    domain = "https://www.newsmth.net"
    # '{}' is filled with the section/page number by str.format.
    base_url = domain + "/nForum/section/{}?ajax"

    def get_content(self, page_number, timeout=10):
        """Download the section page and return the response body as text.

        Args:
            page_number: Value substituted into ``base_url``.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests.get`` can block forever.

        Returns:
            The decoded response body (``response.text``).
        """
        # str.format reference:
        # https://blog.csdn.net/nanhuaibeian/article/details/86591202
        url = self.base_url.format(page_number)
        response = requests.get(url, headers=self.headers, timeout=timeout)
        return response.text

    def get_board_list(self, content):
        """Parse the board table in *content* and return the boards found.

        Each board is printed as it is parsed and also collected into the
        returned list.

        Args:
            content: HTML text as returned by ``get_content``.

        Returns:
            A list of dicts with the keys ``url``, ``title``,
            ``num_topics`` and ``num_posts``. The list is empty when the
            content is empty or contains no matching table rows.
        """
        boards = []
        if not content:
            return boards
        tree = etree.HTML(content)
        if tree is None:
            # Nothing parseable in the document.
            return boards
        rows = tree.xpath("//table[@class='board-list corner']/tbody/tr")
        for row in rows:
            columns = row.xpath('td')
            links = columns[0].xpath('a') if columns else []
            # Skip rows that do not have the expected layout (for example
            # header or placeholder rows) instead of raising IndexError.
            if len(columns) < 7 or not links:
                continue
            link = links[0]
            board = {
                'url': link.get('href'),
                'title': link.text,
                # Number of topics
                'num_topics': columns[5].text,
                # Number of posts
                'num_posts': columns[6].text,
            }
            boards.append(board)
            print(board)
        return boards
if __name__ == '__main__':
    # Script entry point: download section page 1 and print every board
    # found on it.
    crawler = BoardListCrawler()
    page_html = crawler.get_content(1)
    crawler.get_board_list(page_html)
4.结果