We'll start by scraping the data with bs4's document-tree traversal methods.
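Before the full scraper, here is a minimal sketch (run against a made-up HTML snippet, not the runoob page) that shows why the code below chains .next_sibling in pairs: in the parsed tree, the whitespace between tags is itself a text node, so the first call lands on the newline and the second lands on the next tag.

import bs4

html = "<div>head</div>\n<h1>title</h1>\n<p>first</p>\n<p>second</p>"
soup = bs4.BeautifulSoup(html, "html.parser")

# .next_sibling of the <div> is the "\n" text node, not the <h1>
print(repr(soup.div.next_sibling))               # '\n'
# a second .next_sibling reaches the actual <h1> tag
print(soup.div.next_sibling.next_sibling.name)   # 'h1'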
# Scrape the "Python 100 examples" data by traversing the document tree
import bs4
import requests

# 1. Build the page URLs
# http://www.runoob.com/python/python-exercise-example1.html
# http://www.runoob.com/python/python-exercise-example2.html
# The links only differ in the number, so a loop can generate them
url = 'http://www.runoob.com/python/python-exercise-example{}.html'
for i in range(1, 101):
    new_url = url.format(i)
    res = requests.get(new_url).content.decode('utf-8')
    # print(res)
    print('-' * 100)
    # 2. Visit each link, feed the HTML into bs4, and walk the tree:
    #    title    -> the <h1> tag
    #    problem  -> the text of the second <p>
    #    analysis -> the text of the third <p>
    soup = bs4.BeautifulSoup(res, "html.parser")
    # Each pair of .next_sibling calls skips the whitespace text node
    # between tags before landing on the next tag
    h1 = (soup.div.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling
          .next_sibling.next_sibling.next_sibling.next_sibling.h1)
    print(h1.text)
    # The problem <p> is three or four siblings away, depending on whether
    # <h1> is followed directly by a tag or by a whitespace text node
    if h1.next_sibling.name == 'p':
        p2 = h1.next_sibling.next_sibling.next_sibling
    else:
        p2 = h1.next_sibling.next_sibling.next_sibling.next_sibling
    print(p2.text)
    p3 = p2.next_sibling.next_sibling
    print(p3.text)
    # 3. Append the scraped fields to a text file
    with open('python.txt', 'a', encoding='utf-8') as f:
        f.write(h1.text + '\n')
        f.write(p2.text + '\n')
        f.write(p3.text + '\n')
        f.write('-' * 100 + '\n')
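The long .next_sibling chain above breaks as soon as runoob changes its page layout. A less fragile variant (my own sketch, not part of the original code, and assuming the second and third <p> inside the content container really are the problem and the analysis) would locate the tags by name instead:

import bs4
import requests

res = requests.get('http://www.runoob.com/python/python-exercise-example1.html').content.decode('utf-8')
soup = bs4.BeautifulSoup(res, "html.parser")
content = soup.find(id='content')       # the article container used by runoob
title = content.h1.text                 # exercise title
paragraphs = content.find_all('p')      # problem statement and analysis are <p> tags
print(title)
print(paragraphs[1].text)               # second <p>: the problem (assumed layout)
print(paragraphs[2].text)               # third <p>: the analysis (assumed layout)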
The second approach: lxml with XPath.
# coding:utf-8
import requests
from lxml import etree
# import pymysql
import chardet
import xlsxwriter

# Fetch a single page, letting chardet work out the response encoding
def get_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Connection': 'close',
    }
    response = requests.get(url, headers=headers)
    response.encoding = chardet.detect(response.content)['encoding']
    return response.text

# Parse the index page and yield the full URL of every exercise
def parse_one_page(html):
    result = etree.HTML(html)
    item = {}
    # every <li> under the content list holds one exercise link
    item['a1'] = result.xpath("//*[@id='content']/ul/li/a/@href")
    for i in item['a1']:
        yield 'http://www.runoob.com/python/' + str(i)

# Parse a detail page: the second <p> holds the problem statement
def parse_two_page(html):
    result = etree.HTML(html)
    item2 = {}
    item2['b1'] = result.xpath('//*[@id="content"]/p[2]/text()')  # problem statement
    yield item2['b1']

# Placeholder: writing the results to Excel is not implemented yet
def write_to_excell(i, file):
    pass
# Crawl the index page, then visit each detail page it links to
url = 'http://www.runoob.com/python3/python3-examples.html'
html = get_one_page(url)
for link in parse_one_page(html):
    print(link)
    detail_html = get_one_page(link)
    for item in parse_two_page(detail_html):
        print(item)
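The write_to_excell function is left as a pass stub above. Since xlsxwriter is already imported, a minimal sketch of what it could look like is given here; the one-column layout and the results list are my own assumptions, not part of the original code.

# Sketch of an Excel writer, assuming each result is the list of text
# fragments yielded by parse_two_page (column layout is an assumption)
def write_to_excell(results, file):
    workbook = xlsxwriter.Workbook(file)
    worksheet = workbook.add_worksheet()
    for row, item in enumerate(results):
        worksheet.write(row, 0, ''.join(item))  # one problem statement per row
    workbook.close()

# Hypothetical usage, collecting everything before writing:
# results = [item for link in parse_one_page(html)
#            for item in parse_two_page(get_one_page(link))]
# write_to_excell(results, 'python100.xlsx')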