Spider Framework 2: Parsing Web Pages with BeautifulSoup

Paging through list data. The script below collects article links from a list page on china.com.cn and scrapes each article's title and body into a JSON file.

# -*- coding:utf-8 -*-
#@Time : 2020/6/2 0002 15:04
#@Author: Yang-Zhenping
#@File : signal_spider.py
import requests
import json
from bs4 import BeautifulSoup
def get_url(html):
    """Collect article links from a list page and return them as absolute URLs."""
    url_list = []
    res = requests.get(html)
    res.encoding = 'GBK'  # the list page is served as GBK
    soup = BeautifulSoup(res.text, 'html.parser')

    # Article links on this list page carry the class "f14_000000".
    for anchor in soup.find_all("a", class_="f14_000000"):
        href = anchor.get('href')
        if not href or href.isspace():  # skip anchors with empty or blank hrefs
            continue
        full_url = 'http://www.china.com.cn/aboutchina/zhuanti/daizu/' + href
        if full_url not in url_list:  # dedupe on the joined URL, not the bare href
            url_list.append(full_url)

    return url_list
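A caveat on get_url(): it glues the base directory onto every href by hand, which only works while all links on the page are relative. The standard library's urllib.parse.urljoin handles both relative and absolute hrefs. A minimal sketch of the same join (this is my suggestion, not part of the original script):

from urllib.parse import urljoin

base = 'http://www.china.com.cn/aboutchina/zhuanti/daizu/'
# Relative hrefs resolve against base; absolute hrefs pass through unchanged.
print(urljoin(base, 'node_7067670.htm'))
print(urljoin(base, 'http://www.china.com.cn/other/page.htm'))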
def get_data(url):
    """Scrape one article page; append its title and body to a JSON Lines file."""
    dic = {}
    contents = []
    content = ""
    res = requests.get(url)
    res.encoding = 'UTF-8'  # article pages are served as UTF-8
    soup = BeautifulSoup(res.text, 'html.parser')
    title = soup.find("td", class_="fb24").get_text()

    # The article body sits in <td class="f14_000000"> cells; concatenate
    # the text of every <p> inside them.
    for cell in soup.find_all('td', class_='f14_000000'):
        for p in cell.find_all('p'):
            content += p.get_text().strip()

    contents.append(content)

    dic['Title'] = title
    dic['Content'] = contents
    print(dic)

    # Append one JSON object per line (JSON Lines), so each line parses on its own.
    with open('./傣族/节庆.json', 'a+', encoding='utf-8') as file:
        file.write(json.dumps(dic, ensure_ascii=False) + '\n')
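Since get_data() writes one JSON object per line (the JSON Lines convention), the output file can be read back line by line rather than as a single document. A quick sketch of loading it, using the same path get_data() writes to:

import json

records = []
with open('./傣族/节庆.json', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:  # skip blank lines
            records.append(json.loads(line))
print(len(records), 'articles loaded')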
if __name__ == '__main__':
    # This demo's list fits on a single page; see the paging sketch below
    # for sites that split the list across numbered pages.
    html = 'http://www.china.com.cn/aboutchina/zhuanti/daizu/node_7067670.htm'
    urls = get_url(html)  # fetch the list page once, not once per link
    for j, url in enumerate(urls, start=1):
        print(url)
        print(j)
        get_data(url)
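On sites that split a list across numbered pages, the page index usually appears in the list-page URL, and the crawl becomes a loop over that index. A minimal sketch reusing get_url() and get_data() from the script above; the URL template and page count are hypothetical placeholders, and note that get_url() hardcodes its base path, so in practice you would parameterize it for a different site:

# Walk a paginated list: build each list-page URL from a numbered template,
# collect the article links on it, then scrape every article.
list_page_template = 'http://example.com/list/page-{}.html'  # hypothetical pattern
for page in range(1, 6):  # pages 1..5; adjust to the real page count
    for article_url in get_url(list_page_template.format(page)):
        get_data(article_url)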
