代碼如下:
# -*- coding: utf-8 -*-
import urllib.request,socket,re,sys,os,time,requests
from lxml import etree
def _first(node, path, default=''):
    """Return the first result of evaluating XPath *path* on *node*, or *default* if nothing matches."""
    found = node.xpath(path)
    return found[0] if found else default


# Scrape pages 1 and 2 of the CSDN Oracle forum listing and write, for every
# thread row, its title, category, score, author and post time to a text file
# (each entry is also echoed to stdout).
#
# The encoding is given explicitly so the Chinese text is written the same way
# regardless of the platform's default encoding.
with open(r'C:\Users\admin\Desktop\practice_csdn.txt', 'w', encoding='utf-8') as f:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        'Referer': 'https://bbs.csdn.net/forums/Oracle',
    }
    for p in range(1, 3):  # range(1, 3) means page 1 and page 2
        url = 'https://bbs.csdn.net/forums/Oracle?page={}'.format(p)
        # A timeout keeps the script from hanging forever on a stalled
        # connection; raise_for_status() stops us from parsing an error page.
        response = requests.get(url=url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding = 'utf-8'
        s = etree.HTML(response.text)
        time.sleep(2)  # pause between page requests

        block = _first(s, '//*[@id="forums-show"]/div[3]/div[1]/div/dl/dt/h1/text()')
        f.write('論壇名稱:{}\n'.format(block))
        f.write('第{}頁:\n'.format(p))
        f.write('*' * 120 + '\n')

        # The listing is a single table with one <tr> per entry.
        rows = s.xpath('//*[@id="forums-show"]/div[3]/div[4]/table/tr')
        cmax = len(rows)
        print(cmax)  # total number of <tr> rows on this page

        # The first thread is tr[2] (tr[1] is skipped), and per the original
        # author's note the row count is one greater than the index of the
        # last thread, so the final <tr> is skipped as well. This is the same
        # as XPath positions 2 .. cmax-1.
        # NOTE(review): the original loop condition was truncated in the
        # source ("while count"); "count < cmax" is reconstructed from that
        # note -- confirm against the live page.
        for row in rows[1:-1]:
            # Row-relative XPaths avoid re-evaluating the whole document for
            # every field; a missing cell yields '' instead of an IndexError.
            title = _first(row, 'td[1]/a/text()')
            sort = _first(row, 'td[1]/span/a/text()')
            score = _first(row, 'td[2]/text()')
            writer = _first(row, 'td[3]/a/@title')
            wtime = _first(row, 'td[3]/span/text()')

            entry = '標題:{}[{}]\n分數:{} 提問人:{} 提問時間:{}\n'.format(title, sort, score, writer, wtime)
            print(entry, '*' * 50)
            f.write(entry + '*' * 50 + '\n')
        f.flush()  # make each finished page visible on disk immediately
結果如下:
分析:
1、這里的關鍵點就在於靈活地處理xpath,靈活地運用;
2、瀏覽器復制的xpath並不一定是對的,需要自己去頁面代碼查看!
3、注意查看網頁的編碼,確認是否需要轉碼,最好習慣性用轉碼:str轉成bytes用encode(),bytes轉成str用decode()。這里因為用的是requests的get()方法,response.text得到的數據是str格式,所以如需轉碼只能用encode()方法。簡記:gse,ubd!