import chardet
from bs4 import BeautifulSoup
import requests
import time
import random
import json
import re
if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.47 Safari/537.36 Edg/87.0.664.30'
    }
    url = 'https://jobs.51job.com/'
    # Proxy IP pool.
    # BUG FIX: the original wrapped this list in a second list, so
    # random.choice(pro) returned the whole 30-element list instead of one IP.
    pro = ['117.31.103.182', '121.234.54.155', '60.161.248.31', '114.96.171.215', '112.98.54.231',
           '139.212.201.101', '115.150.210.69', '42.54.83.183', '111.72.136.167', '117.57.91.45',
           '117.24.81.21', '60.169.95.239', '125.116.20.225', '27.44.212.116', '36.7.249.37', '114.103.61.144',
           '113.141.222.238', '117.94.117.109', '122.232.194.235', '112.83.181.28', '223.244.193.44',
           '117.57.20.172', '223.215.182.59', '60.166.181.137', '114.104.143.107', '183.165.33.92',
           '121.56.215.95', '139.208.48.96', '182.247.93.238', '36.33.31.221']
    # BUG FIX: requests expects a proxy URL with a scheme ('http://host:port');
    # the bare IP string was not a valid proxy spec.
    # NOTE(review): these entries carry no port — TODO confirm the intended ports.
    text = requests.get(url, proxies={'http': 'http://' + random.choice(pro)},
                        headers=headers)
    # Detect the real page encoding from the raw bytes instead of trusting
    # the (often wrong) HTTP header, then decode with it.
    text.encoding = chardet.detect(text.content)['encoding']
    soup = BeautifulSoup(text.text, 'html5lib')
    # Each <div class="e e5"> is one job-category card; keep at most the
    # first 76 (slicing is safe even when fewer cards are found, unlike the
    # original fixed `while i < 76` loop which raised IndexError).
    dd_list = soup.find_all('div', class_='e e5')[:76]
    print(len(dd_list))
    onename = []  # per card: its <strong> tags (level-1 category titles)
    twoname = []  # per card: its sub-category <a> tags (first <a> removed)
    URLs = []     # every href whose anchor has visible text, across all cards
    a_list = []   # kept at module level so the final print matches the original
    for card in dd_list:
        p_list = card.find_all('strong')
        a_list = card.find_all('a')
        # The first <a> duplicates the <strong> heading link; drop it so the
        # remaining anchors line up with the level-2 titles.
        del a_list[0]
        onename.append(p_list)
        twoname.append(a_list)
        hrefs = []  # this card's hrefs, in document order
        for a in card.find_all('a', href=True):
            if a.get_text(strip=True):
                URLs.append(a['href'])
                hrefs.append(a['href'])
        for s in p_list:  # level-1 title -> first link of the card
            print(s.string, hrefs[0])
        # level-2 titles pair with hrefs[1:], mirroring the removed first <a>
        for pos, anchor in enumerate(a_list, start=1):
            print(anchor.string, hrefs[pos])
    print(len(a_list))
    print(len(URLs))
# Expand each category URL into its first four paginated listing URLs
# (…/p1/ through …/p4/), preserving input order.
MAXURL = []
for base in URLs:
    print(base)
    for page in range(1, 5):
        MAXURL.append(base + 'p%d/' % page)
print(len(MAXURL))
# print(onename)
# print(twoname)
# print(len(twoname))
# for i in onename:
# tit = i.string
# print(len(tit))
# Fetch every paginated listing page and append each <p class="info"> row's
# text fields (as a Python-list repr, one per line) to 58(100000).txt.
# BUG FIX: the original opened the file without `with` and called `fp.close`
# (no parentheses, inside the inner loop), so the file was never closed.
S1 = 0  # running total of rows written across all pages
with open('58(100000).txt', 'a', encoding='utf-8') as fp:
    for url1 in MAXURL:
        text1 = requests.get(url1, proxies={'http': random.choice(pro)},
                             headers=headers)
        # Detect encoding from raw bytes, as for the index page.
        text1.encoding = chardet.detect(text1.content)['encoding']
        soup1 = BeautifulSoup(text1.text, 'html5lib')
        div_list1 = soup1.find_all('p', class_='info')
        print(div_list1)
        for i1 in div_list1:
            print(i1)
            print(len(i1))
            # Collect the .string of every child node of the row.
            ABC = [abc.string for abc in i1]
            print(ABC)
            print('---------------------')
            # Guard against an empty row (the original crashed on ABC[0]).
            if ABC:
                print(ABC[0])
            print(ABC, file=fp)
        N = len(div_list1)
        print(N)
        S1 += N
        print(S1)
        print('-----------------------------')
#print(dd_list)
# for i in dd_list:
# print(i)
# --- Post-process the 58(100000).txt output file ---
import collections  # kept from the original; not used below
# Strip the Python-list punctuation ('[', ']' and quotes) that print(list)
# left in the scraped file, and write the cleaned text to a new file.
# BUG FIX: the original did `word = open(...)` and never closed the handle;
# a `with` block guarantees it is released.
with open("58(100000).txt", encoding='utf-8') as src:
    word = src.read()
print(word)
word = word.replace('[', '')
word = word.replace(']', '')
word1 = word.replace("'", '')
print(word1)
with open('58(100000)01.txt', 'w', encoding='utf-8') as fpw:
    fpw.write(word1)
# TODO: further processing of the cleaned file
# TODO: import the results into Excel (details still undecided)