# Paginate the list data (列表数据翻页)
# -*- coding:utf-8 -*-
#@Time : 2020/6/2 0002 15:04
#@Author: Yang-Zhenping
#@File : signal_spider.py
import requests
import json
from bs4 import BeautifulSoup
def get_url(html):
    """Collect article URLs from a list page.

    Fetches *html* (the URL of a list page), decodes it as GBK, and returns
    the absolute URLs of every article link (``<a class="f14_000000">``),
    de-duplicated while preserving page order.

    :param html: URL of the list page to scrape.
    :return: list of absolute article URLs.
    """
    url_list = []
    res = requests.get(html)
    res.encoding = 'GBK'
    soup = BeautifulSoup(res.text, 'html.parser')
    for anchor in soup.find_all("a", class_="f14_000000"):
        href = anchor.get('href')
        # Skip anchors with no usable href: missing attribute (None) or a
        # blank/whitespace-only value.  The original only skipped " ".
        if not href or not href.strip():
            continue
        full_url = 'http://www.china.com.cn/aboutchina/zhuanti/daizu/' + href
        # Compare the FULL url against the list; comparing the raw href
        # (as before) never matched the stored prefixed URLs, so the
        # de-duplication silently never fired.
        if full_url not in url_list:
            url_list.append(full_url)
    return url_list
def get_data(url):
    """Scrape one article page and append it to the output JSON file.

    Fetches *url* (UTF-8 page), extracts the title (``<td class="fb24">``)
    and the concatenated text of all paragraphs inside
    ``<td class="f14_000000">`` cells, prints the record, and appends it
    as one JSON object (plus a trailing comma and newline) to
    ``./傣族/节庆.json``.

    :param url: URL of the article page to scrape.
    :raises AttributeError: if the page has no ``<td class="fb24">`` title cell.
    """
    res = requests.get(url)
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    title = soup.find("td", class_="fb24").get_text()

    # Accumulate every paragraph's stripped text into one string,
    # spanning all matching <td> cells.
    parts = []
    for cell in soup.find_all('td', class_='f14_000000'):
        for paragraph in cell.find_all('p'):
            parts.append(str(paragraph.get_text()).strip())
    content = "".join(parts)

    dic = {'Title': title, 'Content': [content]}
    print(dic)
    # 'with' guarantees the file handle is closed even if the write fails;
    # the original left the handle open for the life of the process.
    with open('./傣族/节庆.json', 'a+', encoding='utf-8') as file:
        file.write(json.dumps(dic, ensure_ascii=False) + "," + '\n')
if __name__ == '__main__':
    # Dai ethnic group special-topic list page (single page; the commented
    # pagination pattern can be restored with a numbered URL template).
    html = 'http://www.china.com.cn/aboutchina/zhuanti/daizu/node_7067670.htm'
    # Fetch the URL list ONCE.  The original called get_url(html) again on
    # every inner iteration, issuing one extra network request per article.
    urls = get_url(html)
    for count, url in enumerate(urls, start=1):
        print(url)
        print(count)
        get_data(url)