本周学习了些爬虫的知识
1.获取HTML页面中的数据
# -*- coding:UTF-8 -*-
import requests

if __name__ == '__main__':
    # Chapter page whose raw HTML we want to download.
    target = 'http://www.biqukan.com/1_1094/5403177.html'
    # A timeout keeps the script from hanging forever on an unresponsive server.
    req = requests.get(url=target, timeout=10)
    # Dump the decoded response body (the page's HTML) to stdout.
    print(req.text)
2.
Beautiful Soup
使用 Beautiful Soup 来获取我们想要的内容
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests

if __name__ == "__main__":
    # Chapter page to scrape.
    target = 'http://www.biqukan.com/1_1094/5403177.html'
    # A timeout keeps the script from hanging forever on an unresponsive server.
    req = requests.get(url=target, timeout=10)
    html = req.text
    # Name the parser explicitly: without it bs4 emits a warning and picks
    # whichever parser happens to be installed, so results can vary by machine.
    bf = BeautifulSoup(html, 'html.parser')
    # Collect every <div class="showtxt"> element on the page.
    texts = bf.find_all('div', class_='showtxt')
    # Prints the matched elements including their surrounding <div> tags.
    print(texts)
去除div 标签
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests

if __name__ == "__main__":
    # Chapter page to scrape.
    target = 'http://www.biqukan.com/1_1094/5403177.html'
    # A timeout keeps the script from hanging forever on an unresponsive server.
    req = requests.get(url=target, timeout=10)
    html = req.text
    # Name the parser explicitly: without it bs4 emits a warning and picks
    # whichever parser happens to be installed, so results can vary by machine.
    bf = BeautifulSoup(html, 'html.parser')
    # Collect every <div class="showtxt"> element on the page.
    texts = bf.find_all('div', class_='showtxt')
    # .text drops the markup of the first match; each run of eight
    # non-breaking spaces (\xa0) is then replaced by a blank line.
    print(texts[0].text.replace('\xa0' * 8, '\n\n'))