库准备
首先导入 Python 库,requests 和 BeautifulSoup
import requests
from bs4 import BeautifulSoup
抓取接口 cookies 和 User-Agent
使用 cookies 和 User-Agent 绕过 Git 登录,直接进行数据抓取。
1.打开浏览器,进入 Git 登录界面,并 F12 打开开发者工具
2.登录成功后,查看 cookies 和 User-Agent
3.定义格式
获取到 cookies 和 User-Agent 后,按照对应格式将其赋值:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent':'User-Agent的值'}
cookies = {'cookie':'cookies的值'}
4.尝试抓取 Git 数据
url = 'https://gitlab.com/users/sign_in' # git 登录地址
req = requests.get(url, cookies=cookies, headers=headers)
soup = BeautifulSoup(req.content, 'html.parser', from_encoding='utf-8')
print(soup)
输出结果:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<link href="https://github.githubassets.com" rel="dns-prefetch"/>
<link href="https://avatars0.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://avatars1.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://avatars2.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://avatars3.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
<link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/frameworks-02a3eaa24db2bd1ed9b64450595fc2cf.css" integrity="sha512-hddDYPWR0gBbqLRmIZP242WMEiYsVkYI2UCYCVUHB4h5DhD2cbtFJYG+HPh21dZGb+sbgDHxQBNJCBq7YbmlBQ==" media="all" rel="stylesheet">
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/site-7a15476fda05819d4cf0d3adc8724236.css" integrity="sha512-sJwab9nd8bgVMj8MpmRLq3N2korlw5O6ARpNZ8t5ZtoSkFZvDU1e8OKDDnoB4d/Nnu7tiY1ZGoE1gx/CdxKqYg==" media="all" rel="stylesheet">
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/github-be4e45349cf088df7a6636f437c0a167.css" integrity="sha512-uhAd27cNiLn0VE2GVEVUN8D5zW0o7s0QTnCGMnJZkL2HqN9/LwHDi4ndTPJH0upUQHYl/8QF6cwbOYp/KIzlJQ==" media="all" rel="stylesheet">
<meta content="width=device-width" name="viewport"/>
<title>GitHub - Lizi0916/day02</title>
<meta content="Contribute to Lizi0916/day02 development by creating an account on GitHub." name="description"/>
<link href="/opensearch.xml" rel="search" title="GitHub" type="application/opensearchdescription+xml"/>
<link href="https://github.com/fluidicon.png" rel="fluid-icon" title="GitHub"/>
<meta content="1401488693436528" property="fb:app_id"/>
。。。。。。。
可以定位页面元素在输出结果中搜索,以确保已经拿到 Git 数据。
附代码:
import requests
from bs4 import BeautifulSoup
# Request headers captured from a logged-in browser session (F12 dev tools).
# 'xxxxxx' is a placeholder — paste the real User-Agent string here.
headers = {
'User-Agent': 'xxxxxx'}
# Session cookie copied from the browser after a successful login;
# 'xxxxxx' is a placeholder for the actual cookie value.
cookies = {'cookie': 'xxxxxx'}
# Base URL of the GitLab instance (placeholder).
link_O = 'http://gitlabxxxxxx'
# Path/query fragment appended before the numeric pagination offset (placeholder).
link_one = 'xxxxxx'
class GitDate(object):
    """Scrape GitLab commit pages using pre-captured session headers and
    massage the commit data into per-committer groups.

    Relies on the module-level ``headers``, ``cookies``, ``link_O`` and
    ``link_one`` values being configured before any method is called.
    """

    def __init__(self):
        pass

    def verb(self):
        """Fetch paginated commit-list HTML and collect commit links.

        Pages are fetched 40 offsets at a time up to offset 200.
        NOTE(review): only the soup from the *last* fetched page is
        scanned below — this mirrors the original control flow; confirm
        whether every page was meant to be parsed.

        :return: flat list alternating commit href and its date string
                 (first 12 chars of the matching commit-header text).
        """
        offset = 0
        limit = 200
        module = []
        discard = []
        soup = None
        for _ in range(limit):
            if offset > limit:
                break
            url = link_O + link_one + str(offset)
            req = requests.get(url, cookies=cookies, headers=headers)
            soup = BeautifulSoup(req.content, 'html.parser',
                                 from_encoding='utf-8')
            offset += 40
        string0 = ''
        for lin in soup.find_all('li', attrs={'class': 'commit-header'}):
            # Keep the date text of the last commit header on the page.
            string0 = lin.text[:12]
        for link in soup.find_all('a', attrs={'class': 'commit-row-message'}):
            # Climb up to the preceding commit-header sibling to read the
            # date this commit row belongs to.
            link_test = (link.parent.parent.parent.parent.parent.parent
                         .previous_sibling.previous_sibling)
            string1 = link_test.text[:12]
            href = link.get('href')
            if link.text.startswith('M') or link.text.startswith('#ONE-410#'):
                # Merge-style / tagged commits are filtered out.
                discard.append(href)
            elif string1 == string0 and href not in discard:
                # BUG FIX: the original appended the bound method
                # ``link.get`` (not the href) to ``discard``, so the
                # duplicate check below could never match.
                discard.append(href)
                module.append(href)
                module.append(string0)
            elif href not in discard:
                module.append(href)
                module.append(string1)
        return module

    def obtain(self, old_list):
        """Resolve each commit href to its author and changed files.

        :param old_list: flat list alternating href and date (from verb).
        :return: list of author names / ``.java`` file titles / dates,
                 each reduced to the text after its last ``'-'``.
        """
        hrefs = []
        for i in range(0, len(old_list), 2):
            hrefs.append(old_list[i])
        # De-duplicate while preserving first-seen order.
        hrefs = sorted(set(hrefs), key=hrefs.index)
        git_text = []
        g = 1
        for href in hrefs:
            url = 'http://gitlab.htzq.htsc.com.cn' + href
            req = requests.get(url, cookies=cookies, headers=headers)
            soup = BeautifulSoup(req.content, 'html.parser',
                                 from_encoding='utf-8')
            for link in soup.find_all('span',
                                      attrs={'class': 'commit-author-name'}):
                b = soup.find('div',
                              attrs={'class': 'commit-box content-block'}).text
                x = b[2:]
                # SYNTAX FIX: the original whitelist tuple had an
                # unterminated string literal ('xx) — the file failed to
                # parse. The 'xx' entries are redacted author names.
                if link.text in ('xx', 'xx', 'xx', 'xx', 'xx') and \
                        x.startswith('Merge'):
                    git_text.append('过滤数据')
                else:
                    git_text.append(link.text)
            for link1 in soup.find_all('div', attrs={'class': 'file-title'}):
                title = link1.text
                if '.java' not in title:
                    continue
                elif title in git_text and len(title) > 10:
                    pass  # already recorded; skip duplicates
                else:
                    git_text.append(title)
            # Append the date paired with this href in old_list.
            git_text.append(old_list[g])
            g += 2
        for i in range(len(git_text)):
            git_text[i] = git_text[i].split('-')[-1]
        return git_text

    def git_analysis(self, list):
        """Normalise ``.java`` paths to bare class names and drop
        adjacent duplicates.

        :param list: mixed list of names/files/dates (from obtain).
                     Name kept for caller compatibility although it
                     shadows the builtin.
        :return: the same list, cleaned in place.
        """
        for i in range(len(list)):
            if '.java' in list[i]:
                # 'a/b/Foo.java' -> 'Foo'
                list[i] = list[i].split('/')[-1].split('.')[0]
        # Remove adjacent duplicates, scanning from the end so deletions
        # never disturb indices not yet visited.
        # BUG FIX: the original ran down to i == 0 and compared list[0]
        # with list[-1], wrongly deleting the first element whenever it
        # happened to equal the last one.
        for i in range(len(list) - 1, 0, -1):
            if list[i] == list[i - 1]:
                del list[i]
        return list

    def date_end(self, list):
        """Group the cleaned flat list into sub-lists, one per committer.

        A new group starts at every Chinese-range entry (or an entry
        starting with ``'K'``); subsequent entries join that group.

        :param list: cleaned flat list (from git_analysis).
        :return: list of unique groups.
        """
        # Drop adjacent duplicates. (The original's Chinese-range test
        # was redundant: both of its branches simply skipped repeats.)
        deduped = []
        for item in list:
            if deduped and item == deduped[-1]:
                continue
            deduped.append(item)
        for i in range(len(deduped)):
            deduped[i] = deduped[i].split('-')[-1]
        wait = []
        groups = []
        for item in deduped:
            is_cn = '\u4e00' <= item <= '\u9fff'
            if is_cn and not wait:
                wait.append(item)
            elif (is_cn and wait) or item.startswith('K'):
                groups.append(wait)
                wait = [item]
            else:
                wait.append(item)
        # BUG FIX: the original never flushed the final group, silently
        # dropping the last committer's data.
        if wait:
            groups.append(wait)
        # Keep only the first occurrence of each group.
        unique = []
        for grp in groups:
            if grp not in unique:
                unique.append(grp)
        return unique
if __name__ == "__main__": # 测试代码
git = GitDate().verb()
print(len(git))
print('one', git)
two = GitDate().obtain(git)
print('two', two)
three = GitDate().git_analysis(two)
print('three', three)
four = GitDate().date_end(three)
print('four', four)