1 Batch image collection
# Collect images across multiple listing pages
# The pages use a 1 2 3 4 5 6 7 pagination bar at the bottom; the sample is Palace Museum wallpapers
import requests
from bs4 import BeautifulSoup

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    'Connection': 'keep-alive',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
}
url_path = 'https://www.dpm.org.cn/lights/royal/p/'  # base URL of the paginated listing
url_all = []
j = 0
for i in range(2):  # first two listing pages
    print(i)
    url = url_path + str(i + 1) + '.html'
    data = requests.get(url, headers=header).content.decode('utf8')
    soup = BeautifulSoup(data, 'html.parser')
    urldiv = soup.find_all('div', {'class': 'pic'})
    for urltemp in urldiv:
        url_find = urltemp.img['src']
        url_all.append(url_find)
        j = j + 1
        new_url = url_find[:4] + url_find[5:]  # drop the 's' in 'https' so the image is fetched over plain http
        print(url_find)
        imgs = requests.get(new_url, headers=header)
        file_name = "E://Tmp/Bigdata/" + str(j) + ".jpg"
        print('Saving image...')
        with open(file_name, 'wb') as f:  # 'wb' overwrites; the original 'ab' would append duplicate bytes on re-runs
            f.write(imgs.content)
        print(file_name, 'saved successfully!')
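The slice trick above assumes every src begins with 'https'. A more defensive sketch uses urllib.parse.urljoin, which resolves both absolute and relative src values against the listing page (resolve_img_url is a hypothetical helper, not part of the original script):

from urllib.parse import urljoin

# Hypothetical helper: resolve an <img> src against the page it was scraped from,
# so relative paths such as '/Uploads/...' also become absolute, fetchable URLs.
def resolve_img_url(page_url, src):
    return urljoin(page_url, src)

# Example (assumed inputs): resolve_img_url('https://www.dpm.org.cn/lights/royal/p/1.html', urltemp.img['src'])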
2 Counting posts per author
# Count how many posts each author has published
import pandas as pd
from pandas import DataFrame

df = pd.read_excel('E:/Tmp/Bigdata/new0730.xlsx')  # read the local spreadsheet
data = df.loc[:, 'news_author'].values  # read the author column
author_list = []
for authors in data:
    author = authors.strip()
    author_temp = author.split(' ')  # split co-authored cells on spaces
    author_list.extend(author_temp)
author_set = set(author_list)  # deduplicate author names
news_num = []
author_name = []
for item in author_set:
    news_num.append(author_list.count(item))  # count posts per author
    author_name.append(item)
result_temp = {'author_name': author_name,
               'news_num': news_num}
result_datafram = DataFrame(result_temp)
result_datafram.to_excel('E:/Tmp/Bigdata/author.xlsx')
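The count-in-a-loop approach above rescans author_list once per unique name. A minimal equivalent sketch with collections.Counter, assuming the same column name and file paths as the script above:

from collections import Counter
import pandas as pd

df = pd.read_excel('E:/Tmp/Bigdata/new0730.xlsx')
# flatten every space-separated co-author cell into one list of names
names = [name for cell in df['news_author'] for name in cell.strip().split(' ')]
counts = Counter(names)  # counts all names in a single pass
pd.DataFrame({'author_name': list(counts), 'news_num': list(counts.values())}) \
    .to_excel('E:/Tmp/Bigdata/author.xlsx')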
3 Counting and plotting a pie chart
# Count co-author group sizes and plot them as a pie chart
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_excel('E:/Tmp/Bigdata/new0730.xlsx')
data = df.loc[:, 'news_author'].values
author_num = []
for authors in data:
    author = authors.strip()
    author_temp = author.split(' ')
    author_num.append(len(author_temp))  # number of co-authors on this post
author_set = set(author_num)
author_bili = []   # how many posts have each co-author count
author_hezuo = []  # the distinct co-author counts
for item in author_set:
    author_bili.append(author_num.count(item))
    author_hezuo.append(item)
plt.pie(x=author_bili, labels=author_hezuo)  # draw the pie chart
plt.show()  # needed to display the figure when run as a script
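matplotlib's pie also accepts an autopct format string that prints each wedge's share; a short continuation of the script above (the title text is an arbitrary choice):

plt.pie(x=author_bili, labels=author_hezuo, autopct='%1.1f%%')  # label each slice with its percentage
plt.title('Posts by number of co-authors')  # hypothetical chart title
plt.show()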
4 Douban Books Top 250 ranking
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 31 16:04:27 2019
@author: Administrator
"""
import pandas as pd  # used to write the Excel spreadsheet
import requests
import time  # be a polite crawler
from bs4 import BeautifulSoup

titlelist = []  # empty list for the scraped book titles
scorelist = []  # empty list for the scraped ratings
dict1 = {'shuming': titlelist, 'pingfen': scorelist}  # backing dict; 'shuming' = book title, 'pingfen' = rating
for N in range(1, 2):  # only the first page; widen the range to cover all 250 books
    url = 'https://book.douban.com/top250?start=' + str(25 * (N - 1))  # note the URL pattern: start grows by 25 per page
    data = requests.get(url).content.decode('utf8')
    soup = BeautifulSoup(data, 'html.parser')
    contentdiv = soup.find('div', {'class': 'article'})
    content = contentdiv.find_all('div', {'class': 'pl2'})  # book-title containers
    content2 = contentdiv.find_all('span', {'class': 'rating_nums'})  # rating spans
    for i in range(len(content)):  # loop over every title on the page
        title = content[i].find('a').get('title')  # the i-th book title
        titlelist.append(title)
    for i in range(len(content2)):  # loop over every rating on the page
        score = content2[i].text  # the i-th rating
        scorelist.append(score)
    time.sleep(5)  # polite delay between pages
df = pd.DataFrame(dict1)
df.to_excel('E://Tmp/Bigdata/douban.xlsx', index=True, header=True)
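Unlike the other scripts here, this one sends no request headers, and Douban may reject UA-less requests. A hedged drop-in replacement for the bare requests.get(url) above, reusing the header pattern from section 1 (the exact UA string is arbitrary):

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
data = requests.get(url, headers=header).content.decode('utf8')  # same call, now with a browser-like User-Agent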
5 People's Daily Online (English edition), site-wide news
# Collect article links across the English edition of People's Daily Online
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup

urlhead = 'http://en.people.cn'
# channel identifiers: numeric section IDs plus named sections such as 'business' (e.g. opinion)
klist = ['90780', 'business', '90786', '90777', '90882', '90782', '202936', '90779', '102775']
for k in range(len(klist)):
    kd = klist[k]
    linklist = []
    for page in range(1, 3):  # first two index pages per channel; the full run covered 35
        if page == 1:
            url = 'http://en.people.cn/' + kd + '/index.html'
        else:
            url = 'http://en.people.cn/' + kd + '/index' + str(page) + '.html'
        try:
            header = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'}
            data = requests.get(url, headers=header).content.decode('utf-8')
            soup = BeautifulSoup(data, 'html.parser')
            content = soup.find_all('div', {'class': 'd2_17 clear'})
            for i in range(0, len(content)):
                link = content[i].find('a').get('href')  # read the href attribute directly instead of slicing str(link)
                link = urlhead + link
                print('link', link)
                linklist.append(link)
            print('-------')
            print('linklist', linklist)
            time.sleep(6)
        except Exception as e:
            print('Exception:', e)
            print(page)
    df = pd.DataFrame(linklist)
    df.to_csv('人民日报' + kd + '.csv', index=False, header=True)
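Each channel ends up in its own CSV. If a single table is wanted afterwards, a minimal merge sketch (assumes the per-channel files written by the loop above exist in the working directory; the combined filename is hypothetical):

import pandas as pd

klist = ['90780', 'business', '90786', '90777', '90882', '90782', '202936', '90779', '102775']
frames = [pd.read_csv('人民日报' + kd + '.csv') for kd in klist]
all_links = pd.concat(frames, ignore_index=True)  # one row per collected article link
all_links.to_csv('人民日报_all.csv', index=False)  # hypothetical combined output file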
6 Complete crawler 3 (Weibo): scraping keyword search results from Weibo Square