Environment requirements:
Python 3.6 or later
Required packages (all from the standard library):
import json
import csv
import time
from datetime import datetime
import urllib.request
from itertools import groupby
Usage:
If Python is installed on Windows, the file can be run directly; it can also be run from an IDE. After starting, a "start time" prompt appears; enter the date in a form like 2018-10-9. For example, to collect code statistics from 2018-12-1 through 2019-1-9, enter 2018-12-1 as the start time. Note that the start time must be earlier than the end time; entering a start time later than the end time causes an error. (The original post illustrated the time format with a screenshot here.)
If the end time is earlier than the present moment (say today is Jan 11 and end_time was set to Jan 9), the message "Timestamp outside the crawl range, please wait" appears first. This is normal: Gerrit returns changes newest first, so the more recent out-of-range entries are skipped until the crawl reaches the requested window, at which point "Crawling data" is printed.
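As a quick illustration of the expected input (a standalone sketch, not part of the script), the dates are parsed with datetime.strptime using the %Y-%m-%d pattern, which accepts unpadded months and days:

from datetime import datetime

def str_to_date(str_date):
    return datetime.strptime(str_date, '%Y-%m-%d').date()

start_time = str_to_date('2018-12-1')  # unpadded month/day is accepted by %m/%d
end_time = str_to_date('2019-1-9')
assert start_time <= end_time, 'start time must not be later than end time'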
Sample of the final result (screenshot omitted):
Output file listing (screenshot omitted):
Note:
Before re-crawling, delete the existing f.csv file. The script opens the file in append mode ('a'), so if the old file is left in place the new rows (plus a duplicate header row) are written after the previous data.
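If each run should overwrite the file instead of appending, a small variation on data_writer (a sketch, not the author's code) is to open the file in 'w' mode so the header appears exactly once per run:

with open('f.csv', 'w', encoding='utf-8', newline='') as f:  # 'w' truncates data from earlier runs
    csv_out = csv.writer(f)
    csv_out.writerow(['Project', 'owner', 'Inserted Size', 'Deleted Size'])
    csv_out.writerows(row)  # same `row` sequence the script builds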
Source code:
import json
import csv
import time
from datetime import datetime
import urllib.request
from itertools import groupby
def data_writer(row):
"""
数据写入csv文件
"""
with open('f.csv', 'a', encoding='utf-8', newline='') as f:
csv_out = csv.writer(f)
csv_out.writerow(['Project', 'owner', 'Inserted Size', 'Deleted Size'])
for data in row:
csv_out.writerow(data)
def str_to_date(str_date):
"""
字符串转为时间格式
"""
date_obj = datetime.strptime(str_date, '%Y-%m-%d').date()
return date_obj
def spider(url_base):
"""
爬虫
"""
start_time = str_to_date(input("start time(2018-10-9):"))
end_time = str_to_date(input("end_time(2018-10-9):"))
print(start_time, end_time)
row = []
i = 0
while True:
try:
response = urllib.request.urlopen(url_base + str(i))
            page = i // 25 + 1
            print("Page %s --------------------------------------------------------------------" % page)
            s = json.loads(response.read()[4:-1])
for item in s:
realtime = str_to_date(item['updated'].split(' ')[0])
if realtime > end_time:
                    print('Timestamp outside the crawl range, please wait ------------------------------------------', realtime)
continue
elif start_time <= realtime:
if item['branch'] == "master":
row.append((item['project'].split('/')[0], item['owner']['name'], item['insertions'],
item['deletions']))
                        print('Crawling data ----------------------------', realtime)
elif start_time > realtime:
                    print('Reached the start of the range, about to write the data -----------------------------------------------')
return row
i += 25
time.sleep(3)
        except Exception:
print('over')
break
# return row
def clean_data(row):
"""
清洗数据
"""
row.sort()
data_list = [{key: list(value_iter)} for key, value_iter in
groupby(row, lambda data: data[0])]
datas = []
for data in data_list:
for key, values in data.items():
name_groups = [list(value_iter) for key, value_iter in groupby(values, lambda data: data[1])]
print(name_groups)
for name_group in name_groups:
insertions = sum([value[2] for value in name_group])
deletions = sum([value[3] for value in name_group])
datas.append((name_group[0][0], name_group[0][1], insertions, deletions))
return datas
if __name__ == '__main__':
url_base = 'https://gerrit.onap.org/r/changes/?q=status:merged&n=25&O=81&S='
row_data = spider(url_base)
data = clean_data(row_data)
data_writer(data)
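A note on the request URL (an observation, with a minimal standalone sketch): the Gerrit changes endpoint pages through results with n (page size) and S (skip offset), which is why the loop advances i by 25 on every pass; the O= parameter requests additional output options and is left as in the original. Every JSON body starts with the anti-XSSI prefix )]}' that must be stripped before parsing. Assuming the instance is still publicly reachable:

import json
import urllib.request

# Fetch the first page of 25 merged changes (S=0 skips nothing)
url = 'https://gerrit.onap.org/r/changes/?q=status:merged&n=25&S=0'
raw = urllib.request.urlopen(url).read()
changes = json.loads(raw[4:-1])  # drop the ")]}'" prefix and trailing newline
print(len(changes), changes[0]['updated'])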
————————————————————————————————————————————————————————
V1:
# week numbers derived from %U need +1 (see group_date below)
import json
import csv
import time
from datetime import datetime
import urllib.request
from itertools import groupby
def GetNowTime():  # return the current local time as a 'yymmddHH' string (used to name the output file)
return time.strftime("%y%m%d%H", time.localtime(time.time()))
def data_writer(row, date):
"""
数据写入csv文件
"""
    now = GetNowTime()  # avoid shadowing the imported time module
    fname1 = now + ".csv"
print(fname1)
with open(fname1, 'a', encoding='utf-8', newline='') as f:
csv_out = csv.writer(f)
csv_out.writerow(['owner', 'Inserted Size', 'Deleted Size', 'date', 'weeks', date+"_commits"])
for data in row:
csv_out.writerow(data)
def str_to_date(str_date):
"""
字符串转为时间格式
"""
date_obj = datetime.strptime(str_date, '%Y-%m-%d').date()
return date_obj
def spider(url_base):
"""
爬虫
"""
print(start_time, end_time)
row = []
i = 0
while True:
try:
response = urllib.request.urlopen(url_base + str(i))
            page = i // 25 + 1
            print("Page %s --------------------------------------------------------------------" % page)
            s = json.loads(response.read()[4:-1])  # strip the ")]}'" prefix; encoding kwarg removed for Python 3.9+
for item in s:
date = item['updated'].split(' ')[0]
realtime = str_to_date(date)
if realtime > end_time:
                    print('Timestamp outside the crawl range, please wait ------------------------------------------', realtime)
continue
elif start_time <= realtime:
if item['branch'] == "master":
name = item['owner']['name']
insertions = item['insertions']
deletions = item['deletions']
row.append((name, insertions, deletions, date))
                        print('Crawling data ----------------------------', realtime)
elif start_time > realtime:
                    print('Reached the start of the range, about to write the data -----------------------------------------------')
return row
i += 25
        except Exception:
print('over')
break
# return row
def clean_data(row, date_arg):
"""
清洗数据
"""
row.sort()
    # data_list: rows grouped by owner name (requires the sort above, since groupby only merges neighbours)
data_list = [{key: list(value_iter)} for key, value_iter in
groupby(row, lambda data: data[0])]
datas = []
for data in data_list:
for key, values in data.items():
if date_arg == 'day':
func = lambda value: str_to_date(value[3]).day
elif date_arg == 'week':
func = lambda value: str_to_date(value[3]).isocalendar()[1]
# func = lambda value: str(str_to_date(value[3]).year) + '-' + str(
# str_to_date(value[3]).isocalendar()[1])
elif date_arg == 'month':
func = lambda value: str_to_date(value[3]).month
date_datas = group_date(key, data, date_arg, func)
datas.extend(date_datas)
return datas
def group_date(key, data, date_arg, func):
"""
按照day、week、month 进行排序
"""
date_sorted = sorted(data[key], key=func, reverse=True)
date_groups = [list(value_iter) for key, value_iter in groupby(date_sorted, func)]
date_datas = []
    for date_group in date_groups:
        get_date = str_to_date(date_group[0][3])
        weeks = ''  # only populated for the 'week' grouping; avoids a NameError in out_day_data
        if date_arg == 'day':
            date = get_date
        elif date_arg == 'week':
            date = get_date.strftime('%Y')
            weeks = int(get_date.strftime('%U')) + 1  # %U is zero-based (week 00 precedes the first Sunday), hence +1
elif date_arg == 'month':
date = get_date.strftime('%Y-%m')
name = date_group[0][0]
insertions = sum([value[1] for value in date_group])
deletions = sum([value[2] for value in date_group])
day_commits = len(date_group)
out_day_data = (name, insertions, deletions, date, weeks, day_commits)
date_datas.append(out_day_data)
return date_datas
if __name__ == '__main__':
# url_base = 'https://gerrit.onap.org/r/changes/?q=status:merged&n=25&O=81&S='
url_base = 'https://gerrit.acumos.org/r/changes/?q=status:merged&n=25&O=81&S='
start_time = str_to_date(input("start time(2018-10-9):"))
end_time = str_to_date(input("end_time(2018-10-9):"))
# start_time = str_to_date("2019-2-20")
# end_time = str_to_date("2019-4-22")
    date_arg = input("please input the time granularity (day, week or month):")
row_data = spider(url_base)
data = clean_data(row_data, date_arg)
data_writer(data, date_arg)
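Why the +1 in the week branch (a short standalone illustration): strftime('%U') numbers Sunday-started weeks from 00 (the days before the year's first Sunday fall in week 0), while isocalendar() numbers ISO weeks from 1, so the script nudges the %U value up by one to get 1-based numbering:

from datetime import date

d = date(2019, 1, 1)              # a Tuesday; the first Sunday of 2019 is Jan 6
print(d.strftime('%U'))           # '00' -> zero-based, Sunday-start week
print(d.isocalendar()[1])         # 1    -> one-based ISO (Monday-start) week
print(int(d.strftime('%U')) + 1)  # 1    -> the script's adjusted value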
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
V2:
import json
import csv
import time
from datetime import datetime
import urllib.request
from itertools import groupby
def GetNowTime():  # return the current local time as a 'yymmddHH' string (used to name the output file)
return time.strftime("%y%m%d%H", time.localtime(time.time()))
def data_writer(row, date):
"""
数据写入csv文件
"""
    now = GetNowTime()  # avoid shadowing the imported time module
    fname1 = now + ".csv"
print(fname1)
with open(fname1, 'a', encoding='utf-8', newline='') as f:
csv_out = csv.writer(f)
csv_out.writerow(['Inserted Size', 'Deleted Size', 'date', 'weeks', date+"_commits"])
for data in row:
csv_out.writerow(data)
def str_to_date(str_date):
"""
字符串转为时间格式
"""
date_obj = datetime.strptime(str_date, '%Y-%m-%d').date()
return date_obj
def spider(url_base):
"""
爬虫
"""
print(start_time, end_time)
row = []
i = 0
while True:
try:
response = urllib.request.urlopen(url_base + str(i))
            page = i // 25 + 1
            print("Page %s --------------------------------------------------------------------" % page)
            s = json.loads(response.read()[4:-1])  # strip the ")]}'" prefix; encoding kwarg removed for Python 3.9+
for item in s:
date = item['updated'].split(' ')[0]
realtime = str_to_date(date)
if realtime > end_time:
                    print('Timestamp outside the crawl range, please wait ------------------------------------------', realtime)
continue
elif start_time <= realtime:
if item['branch'] == "master":
# name = item['owner']['name']
insertions = item['insertions']
deletions = item['deletions']
row.append((insertions, deletions, date))
                        print('Crawling data ----------------------------', realtime)
elif start_time > realtime:
                    print('Reached the start of the range, about to write the data -----------------------------------------------')
return row
i += 25
        except Exception:
print('over')
break
# return row
def clean_data(row, date_arg):
"""
清洗数据
"""
datas = []
if date_arg == 'day':
func = lambda value: str_to_date(value[2])
elif date_arg == 'week':
        func = lambda value: '%d-%02d' % (str_to_date(value[2]).year, str_to_date(value[2]).isocalendar()[1])  # zero-padded so the string sort stays chronological
elif date_arg == 'month':
func = lambda value: str_to_date(value[2]).strftime('%Y-%m')
date_datas = group_date(row, date_arg, func)
datas.extend(date_datas)
return datas
def group_date(data, date_arg, func):
"""
按照day、week、month 进行排序
"""
date_sorted = sorted(data, key=func, reverse=True)
date_groups = [{key: list(value_iter)} for key, value_iter in groupby(date_sorted, func)]
date_datas = []
for date_group_dict in date_groups:
for date, date_group in date_group_dict.items():
            date_time = ''
            weeks = ''  # only set for the 'week' grouping; avoids a NameError in out_day_data
get_date = str_to_date(date_group[0][2])
if date_arg == 'day':
date_time = date
            elif date_arg == 'week':
                date_time = str(date).split("-")[0]  # year part of the 'YYYY-WW' key
                weeks = str(date).split("-")[1]      # zero-padded week number
elif date_arg == 'month':
date_time = str(date)
# name = date_group[0][0]
insertions = sum([value[0] for value in date_group])
deletions = sum([value[1] for value in date_group])
day_commits = len(date_group)
out_day_data = (insertions, deletions, date_time, weeks, day_commits)
date_datas.append(out_day_data)
return date_datas
if __name__ == '__main__':
url_base = 'https://gerrit.acumos.org/r/changes/?q=status:merged&n=25&O=81&S='
start_time = str_to_date(input("start time(2018-10-9):"))
end_time = str_to_date(input("end_time(2018-10-9):"))
# start_time = str_to_date("2019-04-18")
# end_time = str_to_date("2019-04-22")
    date_arg = input("please input the time granularity (day, week or month):")
row_data = spider(url_base)
data = clean_data(row_data, date_arg)
data_writer(data, date_arg)
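One detail worth calling out: the week key is a string, and sorted() compares strings lexicographically, so an unpadded key like '2019-2' would sort after '2019-10'. That is why the grouping key above zero-pads the week number:

keys = ['2019-1', '2019-2', '2019-10']
print(sorted(keys))    # ['2019-1', '2019-10', '2019-2'] -> not chronological
padded = ['2019-01', '2019-02', '2019-10']
print(sorted(padded))  # ['2019-01', '2019-02', '2019-10'] -> chronological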
Fix for the garbled JSON response format and the bug where the owner email could not be retrieved:
import json
import csv
import time
from datetime import datetime
import urllib.request
from itertools import groupby
def GetNowTime():  # return the current local time as a 'yymmddHH' string (used to name the output file)
return time.strftime("%y%m%d%H", time.localtime(time.time()))
def data_writer(row):
"""
数据写入csv文件
"""
    now = GetNowTime()  # avoid shadowing the imported time module
    fname1 = now + ".csv"
print(fname1)
with open(fname1, 'a', encoding='utf-8', newline='') as f:
csv_out = csv.writer(f)
csv_out.writerow(['Project', 'owner', 'email', 'Inserted Size', 'Deleted Size', 'commits'])
for data in row:
csv_out.writerow(data)
def str_to_date(str_date):
"""
字符串转为时间格式
"""
date_obj = datetime.strptime(str_date, '%Y-%m-%d').date()
return date_obj
def spider(url_base):
"""
爬虫
"""
print(start_time, end_time)
row = []
i = 0
while True:
try:
response = urllib.request.urlopen(url_base + str(i))
            page = i // 25 + 1
            print("Page %s --------------------------------------------------------------------" % page)
            s = json.loads(response.read()[4:-1])  # strip the ")]}'" prefix; encoding kwarg removed for Python 3.9+
for item in s:
realtime = str_to_date(item['updated'].split(' ')[0])
# print(realtime)
# if start_time <= realtime and realtime <= end_time:
# if realtime >= start_time and realtime <= end_time:
if realtime > end_time:
                    print('Timestamp outside the crawl range, please wait ------------------------------------------', realtime)
continue
elif start_time <= realtime:
if item['branch'] == "master":
email = item.get("owner", {}).get("email", "")
# company = email.split("@")[1] if email else ''
# companys = company.split(".")[0] if email else ''
# companys = ".".join(email.split('@')[-1].split(".")[0:-1])
row.append(
(
item.get("project", '').split("/")[0],
item['owner']['name'],
email,
item.get("insertions", ''),
item.get('deletions', ''),
)
)
# row.append((item['project'].split('/')[0], item['owner']['name'], item['owner']['email'],
# item['insertions'],
# item['deletions']))
                        print('Crawling data ----------------------------', realtime)
elif start_time > realtime:
                    print('Reached the start of the range, about to write the data -----------------------------------------------')
return row
i += 25
        except Exception:
print('over')
break
# return row
def clean_data(row):
"""
清洗数据
"""
row.sort()
data_list = [{key: list(value_iter)} for key, value_iter in
groupby(row, lambda data: data[0])]
datas = []
for data in data_list:
for key, values in data.items():
name_groups = [list(value_iter) for key, value_iter in groupby(values, lambda data: data[1])]
print(name_groups)
for name_group in name_groups:
insertions = sum([value[3] for value in name_group])
deletions = sum([value[4] for value in name_group])
commits = len(name_group)
datas.append((name_group[0][0], name_group[0][1], name_group[0][2], insertions, deletions, commits))
return datas
if __name__ == '__main__':
# url_base = 'https://gerrit.acumos.org/r/changes/?q=status:merged&n=25&O=81&S='
url_base = 'https://gerrit.onap.org/r/changes/?q=status:merged&n=25&O=81&S='
start_time = str_to_date(input("start time(2018-10-9):"))
end_time = str_to_date(input("end_time(2018-10-9):"))
row_data = spider(url_base)
data = clean_data(row_data)
data_writer(data)
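A note on clean_data (standard itertools behavior, not specific to this script): groupby only merges consecutive items with equal keys, which is why row.sort() must run before grouping; without it, the same project or owner would be split across several groups:

from itertools import groupby

# made-up sample rows: (project, owner)
rows = [('aai', 'alice'), ('ccsdk', 'bob'), ('aai', 'alice')]
print([k for k, _ in groupby(rows, lambda r: r[0])])  # ['aai', 'ccsdk', 'aai'] -> 'aai' split in two
rows.sort()
print([k for k, _ in groupby(rows, lambda r: r[0])])  # ['aai', 'ccsdk']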
gerrit_company (aggregate by company, derived from the owner's email domain):
import json
import csv
import time
from datetime import datetime
import urllib.request
from itertools import groupby
def GetNowTime():  # return the current local time as a 'yymmddHH' string (used to name the output file)
return time.strftime("%y%m%d%H", time.localtime(time.time()))
def data_writer(row):
"""
数据写入csv文件
"""
    now = GetNowTime()  # avoid shadowing the imported time module
    fname1 = now + ".csv"
print(fname1)
with open(fname1, "a", encoding="utf-8", newline="") as f:
csv_out = csv.writer(f)
csv_out.writerow(
["Project", "company", "Inserted Size", "Deleted Size", "commits"]
)
for data in row:
csv_out.writerow(data)
def str_to_date(str_date):
"""
字符串转为时间格式
"""
date_obj = datetime.strptime(str_date, "%Y-%m-%d").date()
return date_obj
def spider(url_base):
"""
爬虫
"""
print(start_time, end_time)
row = []
i = 0
while True:
try:
response = urllib.request.urlopen(url_base + str(i))
            page = i // 25 + 1
            print(
                "Page %s --------------------------------------------------------------------"
                % page
            )
###############################################################################################
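            # Tolerant prefix handling: instead of slicing a fixed 4 bytes,
            # drop everything before the first '[' so any ")]}'" variant is removed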
data = str(response.read(), encoding="utf8")
data = data.split("[")
data.pop(0)
s = "[" + "[".join(data)
# s = json.loads(response.read()[4:-1], encoding='utf-8')
s = json.loads(s)
###############################################################################################
for item in s:
realtime = str_to_date(item["updated"].split(" ")[0])
if realtime > end_time:
                    print(
                        "Timestamp outside the crawl range, please wait ------------------------------------------",
                        realtime,
                    )
continue
elif start_time <= realtime:
if item["branch"] == "master":
###############################################################################################
email = item.get("owner", {}).get("email", "")
company = email.split("@")[1] if email else ''
companys = company.split(".")[0] if email else ''
# companys = ".".join(email.split('@')[-1].split(".")[0:-1])
row.append(
(
item.get("project",'').split("/")[0],
companys,
item.get("insertions", ''),
item.get('deletions', ''),
)
)
###############################################################################################
print("正在爬数据----------------------------", realtime)
elif start_time > realtime:
print("马上结束了即将写数据-----------------------------------------------")
return row
i += 25
except Exception as e:
print("over")
break
# return row
def clean_data(row):
"""
清洗数据
"""
print("row----------------------------:", row)
row.sort()
data_list = [
{key: list(value_iter)}
for key, value_iter in groupby(row, lambda data: data[0])
]
datas = []
for data in data_list:
for key, values in data.items():
name_groups = [
list(value_iter)
for key, value_iter in groupby(values, lambda data: data[1])
]
print(name_groups)
for name_group in name_groups:
insertions = sum([value[2] for value in name_group])
deletions = sum([value[3] for value in name_group])
commits = len(name_group)
datas.append(
(name_group[0][0], name_group[0][1], insertions, deletions, commits)
)
return datas
if __name__ == "__main__":
# url_base = 'https://gerrit.acumos.org/r/changes/?q=status:merged&n=25&O=81&S='
url_base = "https://gerrit.onap.org/r/changes/?q=status:merged&n=25&O=81&S="
start_time = str_to_date("2019-6-24")
end_time = str_to_date("2019-6-26")
# start_time = str_to_date(input("start time(2018-10-9):"))
# end_time = str_to_date(input("end_time(2018-10-9):"))
row_data = spider(url_base)
data = clean_data(row_data)
data_writer(data)
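The company name is derived purely from the email domain, taking only the first label after '@', which is a rough heuristic: a multi-label corporate domain (e.g. a research subdomain) collapses to its first label, as the commented-out alternative in spider hints. A standalone sketch with made-up addresses:

def email_to_company(email):
    """Return the first domain label of an email address, or '' if the email is missing."""
    if not email:
        return ''
    return email.split('@')[1].split('.')[0]

print(email_to_company('dev@huawei.com'))        # huawei
print(email_to_company('dev@research.att.com'))  # research (first label, not 'att')
print(email_to_company(''))                      # prints an empty line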