We'll start by scraping the data with bs4's document-tree traversal methods.
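Before the full scraper, here is a minimal sketch (run against a made-up HTML snippet, not the runoob page) that shows why the code below chains .next_sibling in pairs: in the parsed tree, the whitespace between tags is itself a text node, so the first call lands on the newline and the second lands on the next tag.

import bs4

html = "<div>head</div>\n<h1>title</h1>\n<p>first</p>\n<p>second</p>"
soup = bs4.BeautifulSoup(html, "html.parser")

# .next_sibling of the <div> is the "\n" text node, not the <h1>
print(repr(soup.div.next_sibling))               # '\n'
# a second .next_sibling reaches the actual <h1> tag
print(soup.div.next_sibling.next_sibling.name)   # 'h1'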
# Scrape the "Python 100 examples" data by traversing the document tree
import bs4
import requests

# 1. Build the page URLs
# http://www.runoob.com/python/python-exercise-example1.html
# http://www.runoob.com/python/python-exercise-example2.html
# The links only differ in the number, so a loop can generate them
url = 'http://www.runoob.com/python/python-exercise-example{}.html'
for i in range(1, 101):
    new_url = url.format(i)
    res = requests.get(new_url).content.decode('utf-8')
    # print(res)
    print('-' * 100)
    # 2. Visit each link, feed the HTML into bs4, and walk the tree:
    #    title    -> the <h1> tag
    #    problem  -> the text of the second <p>
    #    analysis -> the text of the third <p>
    soup = bs4.BeautifulSoup(res, "html.parser")
    # Each pair of .next_sibling calls skips the whitespace text node
    # between tags before landing on the next tag
    h1 = (soup.div.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling
          .next_sibling.next_sibling.next_sibling.next_sibling.h1)
    print(h1.text)
    # The problem <p> is three or four siblings away, depending on whether
    # <h1> is followed directly by a tag or by a whitespace text node
    if h1.next_sibling.name == 'p':
        p2 = h1.next_sibling.next_sibling.next_sibling
    else:
        p2 = h1.next_sibling.next_sibling.next_sibling.next_sibling
    print(p2.text)
    p3 = p2.next_sibling.next_sibling
    print(p3.text)
    # 3. Append the scraped fields to a text file
    with open('python.txt', 'a', encoding='utf-8') as f:
        f.write(h1.text + '\n')
        f.write(p2.text + '\n')
        f.write(p3.text + '\n')
        f.write('-' * 100 + '\n')
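The long .next_sibling chain above breaks as soon as runoob changes its page layout. A less fragile variant (my own sketch, not part of the original code, and assuming the second and third <p> inside the content container really are the problem and the analysis) would locate the tags by name instead:

import bs4
import requests

res = requests.get('http://www.runoob.com/python/python-exercise-example1.html').content.decode('utf-8')
soup = bs4.BeautifulSoup(res, "html.parser")
content = soup.find(id='content')       # the article container used by runoob
title = content.h1.text                 # exercise title
paragraphs = content.find_all('p')      # problem statement and analysis are <p> tags
print(title)
print(paragraphs[1].text)               # second <p>: the problem (assumed layout)
print(paragraphs[2].text)               # third <p>: the analysis (assumed layout)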
The second approach: lxml with XPath.
# coding:utf-8
import requests
from lxml import etree
# import pymysql
import chardet
import xlsxwriter

# Fetch a single page, letting chardet work out the response encoding
def get_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Connection': 'close',
    }
    response = requests.get(url, headers=headers)
    response.encoding = chardet.detect(response.content)['encoding']
    return response.text

# Parse the index page and yield the full URL of every exercise
def parse_one_page(html):
    result = etree.HTML(html)
    item = {}
    # every <li> under the content list holds one exercise link
    item['a1'] = result.xpath("//*[@id='content']/ul/li/a/@href")
    for i in item['a1']:
        yield 'http://www.runoob.com/python/' + str(i)

# Parse a detail page: the second <p> holds the problem statement
def parse_two_page(html):
    result = etree.HTML(html)
    item2 = {}
    item2['b1'] = result.xpath('//*[@id="content"]/p[2]/text()')  # problem statement
    yield item2['b1']

# Placeholder: writing the results to Excel is not implemented yet
def write_to_excell(i, file):
    pass
# Crawl the index page, then visit each detail page it links to
url = 'http://www.runoob.com/python3/python3-examples.html'
html = get_one_page(url)
for link in parse_one_page(html):
    print(link)
    detail_html = get_one_page(link)
    for item in parse_two_page(detail_html):
        print(item)
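The write_to_excell function is left as a pass stub above. Since xlsxwriter is already imported, a minimal sketch of what it could look like is given here; the one-column layout and the results list are my own assumptions, not part of the original code.

# Sketch of an Excel writer, assuming each result is the list of text
# fragments yielded by parse_two_page (column layout is an assumption)
def write_to_excell(results, file):
    workbook = xlsxwriter.Workbook(file)
    worksheet = workbook.add_worksheet()
    for row, item in enumerate(results):
        worksheet.write(row, 0, ''.join(item))  # one problem statement per row
    workbook.close()

# Hypothetical usage, collecting everything before writing:
# results = [item for link in parse_one_page(html)
#            for item in parse_two_page(get_one_page(link))]
# write_to_excell(results, 'python100.xlsx')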