import requests
from bs4 import BeautifulSoup
import re
import os
import datetime
# First grab the request headers from Chrome (DevTools), including the User-Agent and Cookie
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
'Cookie': 'tips=1; V7Qq_2132_smile=1D1; nodeId=_all; nodeType=1; V7Qq_2132_saltkey=jE3E1veZ; V7Qq_2132_lastvisit=1540778092; warning=1; TimeOut=-1; LoginTime=1541143580000; ShowAlert=0; ShowTimeOut=0; V7Qq_2132_editormode_e=1; V7Qq_2132_ulastactivity=be5f8Jd10mhLpV65dxvLP0K1%2BKlfS7mfpC74GPDlXhH0gJSJoiAR; V7Qq_2132_forum_lastvisit=D_36_1541556325; V7Qq_2132_visitedfid=48D36D46D131D53D42D45D39D37D40; PD_STATEFUL_e2a6d2b4-9490-11e6-952f-d8d385a4d314=%2FMultiSeUn; logintype=4; vfCode=uDYGxC; tivoli_loginname=zhanglei1; auth_flag=true; PD-H-SESSION-ID=4_mVnmIh2aciCM3p83ix2dAy5ASdFUA+eErAe8HfXym7XLVZJX; loged=4_mVnmIh2aciCM3p83ix2dAy5ASdFUA+eErAe8HfXym7XLVZJX; messageStr=%u4E0A%u6B21%u767B%u5F55%u65F6%u95F4%uFF1A%3Cfont%20color%u7B49%u4E8E%27yellow%27%3E2018-11-07%2019%3A18%3A22%3C/font%3E%uFF0CIP%uFF1A%3Cfont%20color%u7B49%u4E8E%27yellow%27%3E10.7.31.170%3C/font%3E; goto_url=/; AMWEBJCT!%2FportalserverU!JSESSIONID=0000Yj0K3_CQF9zfidzS0LuSFyw:1a4c3h4fl; ltpacreatetime=1541637238384; acc_valid=1; AMWEBJCT!%2Fportalwas1!JSESSIONID=0000GGz5NFSVoynZVO4PiccMllM:1a3qoeddj; mssPortalPreLogin=0; PD_STATEFUL_a2d06bb2-9f14-11e1-9654-0050561100ae=%2Fappserver; AMWEBJCT!%2FIDAP!JSESSIONID=E802784C0A2987E82501D76C1008410E; Coremail.sid=BAKDKuBBUHYtxYGcJWBBXFTiAMtOESTx; updatesessiontime=1541642831; AMWEBJCT!%2Fportalserver!JSESSIONID=0000Dqyy57wnLWYjUcskCGp_-PW:14q3sh66a; LtpaToken=x5qkJ7PIdJ36dd3xp+WPwGG8KyvONhpP6LUAK5mJKm6q+7vewMmlzZsUNch+tED1xN8hjrf6JeJ/mP+G7jlYr4VpPYwLf6FW2ZnHCndRB0MaZIpEGUmWZRWwaoI5cs/42A+/QIWYCFJpn7L2RJ34eYoQoHNVwr5oWXkbFGArfUWlPjf1p+rEXhk8lAjWHxpHMR500Colf3GTIKKQoIqIwW1AwjsbFuK0SfGzuEh8WI3Iy3VCcxBo8vTEMOHOh4DHJhrJ6esQzRVszXNesWgOP5f1hl/AfBrPbbgNEnuupUj0cxT+PKIUKj0x7uIYM6PQC9h19EnprymCc6dAF0vZxmMnaYeAVfWz; AMWEBJCT!%2Fportalserver!JSESSIONID2=0000HvsQzC2kC1VsMmrl9OZqLjI:14q3sgu18; MssSsoToken=QW6uLAypiih/mW33jw2kbkF2L1vA6RZjaBVUrTGH/gA=; AMWEBJCT!%2Fpwas_virtual!JSESSIONID=0000xtpzPuILdJxOu3r2w2rAxoT:1amslq7b0; AMWEBJCT!%2Fappserver!JSESSIONID=0000TRD1aVMFw3IVSfIj1aKqRDw:16saotmp4'
}
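# The Cookie above is tied to one intranet session and normally expires quickly, so it
# usually has to be copied out of Chrome's DevTools again before each run. A minimal
# sanity check (assuming the menu endpoint used later in the script only answers while
# the session is valid) is to request it once and look at the status code:
# check = requests.get('http://www.sh.ctc.com/CompanyAddressListNew/deptCustom.do?method=loadMenuData',
#                      headers=headers, timeout=10)
# print(check.status_code)   # anything other than 200 usually means the Cookie has expired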
# Scrape one employee's photo and save it under the employee's name
def single_agent(agent_id, agent_py, virtualType, part_Name, part_ID):
    url = r"http://www.sh.ctc.com/CompanyAddressListNew/newDeptShow.do?method=doViewLayer&id=" + agent_id + "&isVirtal=no&zygzh=" + agent_py + "&ZDWID=" + part_ID + "&virtualType=" + virtualType
    # print(url)
    r = requests.get(url=url, headers=headers)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')
    staff_detail = []
    for td in soup.select('td'):
        staff_detail.append(td.text.strip())
    # print(staff_detail)
    agent_name = staff_detail[0]      # employee name
    agent_gonghao = staff_detail[3]   # employee number
    # print(agent_name, agent_gonghao)
    pic_url = soup.find(name='img', attrs={'width': '100'})['src']
    pic_html = requests.get(url=pic_url, headers=headers)
    file_path = os.path.abspath('.\\上海电信员工照片\\' + '{}/{}.{}'.format(part_Name, agent_name, 'jpg'))
    # Two employees may share the same name, so append the employee number if the file already exists
    if os.path.exists(file_path):
        file_path = os.path.abspath('.\\上海电信员工照片\\' + '{}/{}.{}'.format(part_Name, agent_name + '-' + agent_gonghao, 'jpg'))
    with open(file_path, 'wb') as f:
        f.write(pic_html.content)
# Get the total number of pages in this department's address book
def Part_List(part_Name, part_ID):
    url = r"http://www.sh.ctc.com/CompanyAddressListNew/newDeptShow.do?method=doSearch&ZDWID=" + part_ID + "&currentPage=1&orderIndex=&orderSign=1&str=all&isVirtal=no"
    r = requests.get(url=url, headers=headers)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')
    totalPage = int(soup.find(name='input', attrs={'name': 'totalPage'})['value'])
    # Pull the department's headcount out of the page with a regular expression
    temp_renshu = soup.find(name='b', attrs={'class': 'f-fam1'}).string
    if temp_renshu == '没有找到记录.':    # literal "no records found" text returned by the page
        print("No records found for department {}".format(part_Name))
        return temp_renshu
    else:
        renshu = int(re.search(r'\D\D(\d+)\D', temp_renshu).group(1))
        print("Scraping photos of the {} people in department {}".format(renshu, part_Name))
        return totalPage
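# Worked example for the regular expression above (the exact page wording is an
# assumption; the headcount sits inside the <b class="f-fam1"> element):
#     re.search(r'\D\D(\d+)\D', '共有328人').group(1)  ->  '328'
# \D\D skips the two leading non-digit characters, (\d+) captures the headcount,
# and the final \D requires one more non-digit character (such as '人') after it.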
# For every page, get each employee's id, pinyin name and "virtual" flag (secondments and the like)
def get_Agent(totalPage, part_Name, part_ID):
    for j in range(totalPage + 1):
        url = r"http://www.sh.ctc.com/CompanyAddressListNew/newDeptShow.do?method=doSearch&ZDWID=" + part_ID + "&currentPage=" + str(j) + "&orderIndex=&orderSign=1&str=all&isVirtal=no"
        r = requests.get(url=url, headers=headers)
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'lxml')
        agent_clickView = soup.find_all(name='div', attrs={'style': 'cursor: pointer;'})
        for i in range(len(agent_clickView)):
            # The onclick handler carries the id, pinyin name and virtual flag as quoted arguments
            clickView = str(agent_clickView[i]).split("'")
            agent_id = clickView[1]
            agent_py = clickView[3]
            virtualType = clickView[5]
            single_agent(agent_id, agent_py, virtualType, part_Name, part_ID)
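# Illustration of the split above (the markup is a reconstruction, not copied from the
# real page): each clickable row looks roughly like
#     <div style="cursor: pointer;" onclick="doViewLayer('12345','zhangsan','no')">...</div>
# so splitting its string form on the single quote leaves the id, pinyin name and
# virtual flag at indices 1, 3 and 5 of the resulting list.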
# Get the name and id of every third-level department (e.g. the Pudong telecom bureau)
starttime = datetime.datetime.now()
url = 'http://www.sh.ctc.com/CompanyAddressListNew/deptCustom.do?method=loadMenuData'
response = requests.get(url, headers=headers)
dept_list = response.json()
for i in dept_list:
    if i.get('jt_code'):
        part_ID = i.get('jt_code')
        part_Name = i.get('name')
        part_Name = part_Name.replace('/', '-')    # '/' in a name would otherwise be read as a path separator
        path = os.path.abspath('.\\上海电信员工照片\\' + part_Name)
        if not os.path.exists(path):
            os.makedirs(path)    # makedirs also creates the top-level folder on the first run
        # print(part_ID, part_Name)
        temp_renshu = Part_List(part_Name, part_ID)
        # print(temp_renshu)
        if temp_renshu == '没有找到记录.':    # no records in this department, so drop its empty folder
            os.rmdir(path)
            continue
        else:
            totalPage = temp_renshu
            get_Agent(totalPage, part_Name, part_ID)
    else:
        continue
endtime = datetime.datetime.now()
total_time = (endtime - starttime).seconds
print("Finished scraping employee photos; total time: {} seconds".format(total_time))