Spider Framework 2: Parsing Web Pages with BeautifulSoup

Paging through list data. The script below collects article links from a list page on china.com.cn and scrapes each article's title and body into a JSON file.

# -*- coding:utf-8 -*-
#@Time : 2020/6/2 0002 15:04
#@Author: Yang-Zhenping
#@File : signal_spider.py
import requests
import json
from bs4 import BeautifulSoup
def get_url(html):
    """Collect article links from a list page and return them as absolute URLs."""
    url_list = []
    res = requests.get(html)
    res.encoding = 'GBK'  # the list page is served as GBK
    soup = BeautifulSoup(res.text, 'html.parser')

    # Article links on this list page carry the class "f14_000000".
    for anchor in soup.find_all("a", class_="f14_000000"):
        href = anchor.get('href')
        if not href or href.isspace():  # skip anchors with empty or blank hrefs
            continue
        full_url = 'http://www.china.com.cn/aboutchina/zhuanti/daizu/' + href
        if full_url not in url_list:  # dedupe on the joined URL, not the bare href
            url_list.append(full_url)

    return url_list
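A caveat on get_url(): it glues the base directory onto every href by hand, which only works while all links on the page are relative. The standard library's urllib.parse.urljoin handles both relative and absolute hrefs. A minimal sketch of the same join (this is my suggestion, not part of the original script):

from urllib.parse import urljoin

base = 'http://www.china.com.cn/aboutchina/zhuanti/daizu/'
# Relative hrefs resolve against base; absolute hrefs pass through unchanged.
print(urljoin(base, 'node_7067670.htm'))
print(urljoin(base, 'http://www.china.com.cn/other/page.htm'))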
def get_data(url):
    """Scrape one article page; append its title and body to a JSON Lines file."""
    dic = {}
    contents = []
    content = ""
    res = requests.get(url)
    res.encoding = 'UTF-8'  # article pages are served as UTF-8
    soup = BeautifulSoup(res.text, 'html.parser')
    title = soup.find("td", class_="fb24").get_text()

    # The article body sits in <td class="f14_000000"> cells; concatenate
    # the text of every <p> inside them.
    for cell in soup.find_all('td', class_='f14_000000'):
        for p in cell.find_all('p'):
            content += p.get_text().strip()

    contents.append(content)

    dic['Title'] = title
    dic['Content'] = contents
    print(dic)

    # Append one JSON object per line (JSON Lines), so each line parses on its own.
    with open('./傣族/节庆.json', 'a+', encoding='utf-8') as file:
        file.write(json.dumps(dic, ensure_ascii=False) + '\n')
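Since get_data() writes one JSON object per line (the JSON Lines convention), the output file can be read back line by line rather than as a single document. A quick sketch of loading it, using the same path get_data() writes to:

import json

records = []
with open('./傣族/节庆.json', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:  # skip blank lines
            records.append(json.loads(line))
print(len(records), 'articles loaded')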
if __name__ == '__main__':
    # This demo's list fits on a single page; see the paging sketch below
    # for sites that split the list across numbered pages.
    html = 'http://www.china.com.cn/aboutchina/zhuanti/daizu/node_7067670.htm'
    urls = get_url(html)  # fetch the list page once, not once per link
    for j, url in enumerate(urls, start=1):
        print(url)
        print(j)
        get_data(url)
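On sites that split a list across numbered pages, the page index usually appears in the list-page URL, and the crawl becomes a loop over that index. A minimal sketch reusing get_url() and get_data() from the script above; the URL template and page count are hypothetical placeholders, and note that get_url() hardcodes its base path, so in practice you would parameterize it for a different site:

# Walk a paginated list: build each list-page URL from a numbered template,
# collect the article links on it, then scrape every article.
list_page_template = 'http://example.com/list/page-{}.html'  # hypothetical pattern
for page in range(1, 6):  # pages 1..5; adjust to the real page count
    for article_url in get_url(list_page_template.format(page)):
        get_data(article_url)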
