from bs4 import BeautifulSoup
import re
import urllib.request,urllib.error
import xlwt
import sqlite3
def main():
    """Collect the job rows with ``getData`` and store them via ``saveDbDate``.

    The rows are written to the database file ``job.db`` in the current
    working directory.
    """
    db_file = 'job.db'
    rows = getData()
    saveDbDate(rows, db_file)
# Pre-compiled patterns applied to the raw page text by getData().  Each one
# captures the value that follows a quoted key; `.*?` is non-greedy, so a
# capture stops at the first terminator that follows it.

# Value of "job_href" (the posting link).
findLink = re.compile(r'"job_href":"(.*?)"')
# Value of "job_name".
findName = re.compile(r'"job_name":"(.*?)"')
# Value of "providesalary_text"; the capture may be an empty string.
findSalary = re.compile(r'"providesalary_text":"(.*?)"')
# Everything between "attribute_text": and the following ,"companysize_text":
# key (the raw, still-bracketed list text).
findAttribute = re.compile(r'"attribute_text":(.*?),"companysize_text":')
# Value of "company_name".
findCname = re.compile(r'"company_name":"(.*?)"')
# Value of "companytype_text".
findCtype = re.compile(r'"companytype_text":"(.*?)"')
# Value of "companysize_text".
findCsize = re.compile(r'"companysize_text":"(.*?)"')
# Everything between "jobwelf_list": and the following ,"attribute_text": key.
findJobwelf = re.compile(r'"jobwelf_list":(.*?),"attribute_text":')
def _normalize_salary(raw):
    """Convert one raw salary capture into a ``(salary, salarytype)`` pair.

    * An empty capture yields ``('面议', '面议')``.
    * Text ending in 日/月/年 uses that character as ``salarytype`` and drops
      the trailing two characters (the "/<unit>" suffix).
    * Any other text containing "/" is split at the last "/" (amount before,
      period after); text without "/" is kept whole with an empty period.
    * A "low-high万" range is multiplied by 10 and rewritten as "low-highK";
      a "low-high千" range just swaps the unit for "K".
    """
    # Strip backslashes (presumably JSON escapes such as "\/").
    text = raw.replace('\\', '')
    if not text:
        return '面议', '面议'

    if text[-1] in ('日', '月', '年'):
        salarytype = text[-1]
        amount = text[:-2]
    elif '/' in text:
        # Unrecognised period: previously this left salarytype unbound (or
        # carrying the previous row's value).
        amount, _, salarytype = text.rpartition('/')
    else:
        amount, salarytype = text, ''

    if '-' in amount:
        if amount.endswith('万'):
            low, high = amount[:-1].split('-')[:2]
            try:
                amount = '%d-%dK' % (int(float(low) * 10),
                                     int(float(high) * 10))
            except ValueError:
                # Not a plain numeric range; keep the text unchanged.
                pass
        elif amount.endswith('千'):
            amount = amount[:-1] + 'K'
    return amount, salarytype


def _split_attributes(raw):
    """Convert one raw attribute capture into (experience, education, neednum).

    Quotes, square brackets and backslashes are removed and the rest is split
    on commas.  The first item is never used.  With four items, items 2-4 are
    returned; with three, experience defaults to '校招'; with two, experience
    defaults to '校招' and education to '经验不限'.  Any other length yields
    three empty strings so that every row keeps the same number of columns.
    """
    parts = re.sub(r'["\[\]\\]', '', raw).split(',')
    if len(parts) == 4:
        return parts[1], parts[2], parts[3]
    if len(parts) == 3:
        return '校招', parts[1], parts[2]
    if len(parts) == 2:
        return '校招', '经验不限', parts[1]
    return '', '', ''


def getData(pages=range(1, 41)):
    """Download the search result pages and extract one row per job.

    Args:
        pages: iterable of page numbers to fetch; defaults to pages 1-40.

    Returns:
        A list of 10-item lists:
        ``[link, name, salary, salarytype, experience, education, neednum,
        company name, company type, company size]``.

    Pages for which ``askURL`` returns an empty string are skipped.  The
    fields are matched up positionally from independent regex searches, so a
    page is truncated to the shortest of the field lists instead of raising
    ``IndexError``.
    """
    datalist = []
    for page in pages:
        url = ('https://search.51job.com/list/080200,000000,0000,00,9,99,python,2,'
               + str(page)
               + '.html?lang=c&postchannel=0000&workyear=99&cotype=99'
               '&degreefrom=99&jobterm=99&companysize=99&ord_field=0'
               '&dibiaoid=0&line=&welfare=')
        html = askURL(url)
        if not html:
            continue

        links = [link.replace('\\', '') for link in findLink.findall(html)]
        names = findName.findall(html)
        salaries = [_normalize_salary(s) for s in findSalary.findall(html)]
        attributes = [_split_attributes(a) for a in findAttribute.findall(html)]
        cnames = findCname.findall(html)
        ctypes = findCtype.findall(html)
        csizes = findCsize.findall(html)

        for (link, name, (salary, salarytype), (experience, education, neednum),
             cname, ctype, csize) in zip(links, names, salaries, attributes,
                                         cnames, ctypes, csizes):
            datalist.append([link, name, salary, salarytype, experience,
                             education, neednum, cname, ctype, csize])
    return datalist
def askURL(url, timeout=30):
    """Fetch ``url`` and return the response body decoded as GBK.

    Args:
        url: address to request.
        timeout: seconds to wait on the connection before giving up.

    Returns:
        The decoded body, or ``''`` when the request fails.  Failures are
        printed (HTTP status code and/or reason) rather than raised.  Bytes
        that are not valid GBK are replaced instead of aborting the decode.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0"
    }
    request = urllib.request.Request(url, headers=head)
    try:
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read().decode('GBK', errors='replace')
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
    except OSError as e:
        # Socket-level errors raised while reading (e.g. a read timeout) are
        # not wrapped in URLError.
        print(e)
    return ''
def saveData(datalist, savepath):
    """Write the job rows to an Excel workbook at ``savepath``.

    Row 0 holds the ten column titles; each following row holds one entry of
    ``datalist``.  Every row in ``datalist`` is written (no fixed row count is
    assumed), and at most one value per title column is written from each row.
    """
    print('save...')
    job = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = job.add_sheet('51job python 杭州', cell_overwrite_ok=True)
    col = ('链接', '工作名', '薪资', '计算方式', '工作经验', '学历', '招收人数', '公司名', '公司类型', '公司规模')
    for column, title in enumerate(col):
        sheet.write(0, column, title)
    # start=1 leaves row 0 for the header.
    for row, data in enumerate(datalist, start=1):
        for column, value in enumerate(data[:len(col)]):
            sheet.write(row, column, value)
    job.save(savepath)
def saveDbDate(datalist, savedbpath):
    """Insert every row of ``datalist`` into the ``job`` table.

    Args:
        datalist: iterable of 10-item rows in the column order
            link, jbname, jbsalary, salarytype, experience, education,
            neednum, cname, ctype, csize.
        savedbpath: path of the SQLite database file.

    ``init_db`` is called first.  Values are bound as ``?`` parameters, so
    quotes inside a value cannot break the statement, and the caller's rows
    are not modified.  All rows are inserted in one transaction: it is
    committed on success and rolled back if any insert fails.
    """
    init_db(savedbpath)
    sql = """
        insert into job
        (link,jbname,jbsalary,salarytype,experience,education,neednum,cname,ctype,csize)
        values (?,?,?,?,?,?,?,?,?,?)"""
    conn = sqlite3.connect(savedbpath)
    try:
        with conn:  # commits on success, rolls back on exception
            conn.executemany(sql, (tuple(data) for data in datalist))
    finally:
        conn.close()
def init_db(savedbpath):
    """Create the ``job`` table in the database at ``savedbpath`` if missing.

    Safe to call repeatedly: an existing ``job`` table is left untouched.
    ``id`` is an auto-incrementing integer primary key, so inserts that omit
    it receive consecutive ids.
    """
    sql = """
        create table if not exists job
        (
            id integer primary key autoincrement,
            link text,
            jbname varchar ,
            jbsalary text ,
            salarytype text ,
            experience text ,
            education text ,
            neednum text ,
            cname varchar ,
            ctype text ,
            csize text
        )
    """
    conn = sqlite3.connect(savedbpath)
    try:
        with conn:  # commits on success, rolls back on exception
            conn.execute(sql)
    finally:
        conn.close()
# Script entry point: run main() only when this file is executed directly,
# then print a completion message.
# NOTE(review): the original indentation was lost; the print is assumed to
# belong inside the guard -- confirm.
if __name__ == '__main__':
    main()
    print("爬取完毕")