抓取博客文章
分析网页结构,获取列表页所有URL,不同的列表页每次只有后边的数字发生变化,这就好办了,那就去找后边的数字
F12看一下,找到了,复制链接看一下,发现没什么问题,接下来就取它呗。
用Xpath并不好取,取出来还有其他的URL,费事,用正则简单多了,欧里给,干就完了
# Illustration snippet from the post: fetch the index page and regex out the
# per-article URL paths (duplicated inside Bot.get_html below).
response = requests.get(self.url,headers=self.header).text
# NOTE(review): non-raw string — '\d' and '\/' are invalid escape sequences
# (SyntaxWarning since Python 3.12); prefer a raw string r'...'.
demo = re.compile('<a href=\"(\/wiki\/\d+\/\d*)\".*?>.*?<\/a>',re.S)
lists = demo.findall(response)
for i in lists:
# NOTE(review): the next line lost its indentation in the paste — it belongs
# inside the for loop body.
html = 'https://www.liaoxuefeng.com'+i
看一下获取的没什么毛病,那就请求所有URL,抓取信息就OK了,直接上代码
# 作者:胖虎
import requests
import time
from lxml import etree
import re
class Bot:
    """Scrape article paragraphs from the liaoxuefeng.com wiki.

    Fetches the index page, extracts every article's URL path with a
    regex, then downloads each article and prints its paragraph text.
    """

    # Compiled once at class-definition time instead of on every call.
    # Raw string fixes the original's invalid escape sequences ('\d', '\/'
    # in a non-raw literal raise SyntaxWarning on Python 3.12+).
    LINK_RE = re.compile(r'<a href="(/wiki/\d+/\d*)".*?>.*?</a>', re.S)

    def __init__(self):
        # Index page that lists all article links.
        self.url = 'https://www.liaoxuefeng.com/wiki/1016959663602400'
        # Browser-like headers; the site rejects bare requests without them.
        self.header = {
            'Referer': 'https://www.liaoxuefeng.com/wiki/1016959663602400',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Host': 'www.liaoxuefeng.com'
        }

    def get_html(self):
        """Scrape every linked article and print its paragraphs.

        Returns:
            list[str]: all paragraph strings collected across articles
            (printing is preserved as a side effect for compatibility).
        """
        # timeout added so a stalled connection cannot hang the scraper.
        index_html = requests.get(self.url, headers=self.header, timeout=30).text
        paths = self.LINK_RE.findall(index_html)
        collected = []
        for path in paths:
            page_url = 'https://www.liaoxuefeng.com' + path
            time.sleep(5)  # be polite: throttle requests to the server
            resp = requests.get(page_url, headers=self.header, timeout=30)
            tree = etree.HTML(resp.text)
            # XPath targets the article body's <p> text nodes.
            paragraphs = tree.xpath('//*[@id="x-content"]/div[2]/p/text()')
            print(paragraphs)
            for text in paragraphs:
                print(text)
            print('________________________________________________________________________-')
            collected.extend(paragraphs)
        return collected
# Script entry point: run the scraper when executed directly.
if __name__ == '__main__':
    Bot().get_html()