# 主要是字体反爬虫, 逐一破解即可 (main obstacle is font-based anti-scraping; decode each glyph)
#爬取实习僧 最新版本
import requests
from bs4 import BeautifulSoup
# Request headers shared by every fetch: a generic browser User-Agent so the
# site does not reject the default python-requests UA.
kv={"user-agent":'Mozilla/5.0'}
# UTF-8 byte sequences of the site's private-use font glyphs mapped to the
# real ASCII digit each one renders as (reverse-engineered; see notes below).
_GLYPH_TO_DIGIT = {
    b'\xee\x8c\x85': b'0',
    b'\xee\xaa\x9f': b'1',
    b'\xee\x9f\xa1': b'2',
    b'\xef\x8a\x97': b'3',
    b'\xee\x82\xab': b'4',
    b'\xee\x82\x87': b'5',
    b'\xee\xad\xb7': b'6',
    b'\xee\xae\x80': b'7',
    b'\xee\xb2\xbf': b'8',
    b'\xee\xb3\x9e': b'9',
}


def _decode_digits(text):
    """Replace the site's obfuscated font glyphs in *text* with plain digits.

    The site serves salary/schedule numbers through a custom web font whose
    private-use code points stand in for 0-9; this undoes that mapping.
    """
    raw = text.encode("utf-8")
    for glyph, digit in _GLYPH_TO_DIGIT.items():
        raw = raw.replace(glyph, digit)
    return raw.decode("utf-8")


def detail_page(url):
    """Fetch one job-detail page, decode its font-obfuscated fields, print a summary.

    Parameters
    ----------
    url : str
        Absolute URL of a job-detail page on shixiseng.com.

    Side effects: performs one HTTP GET and prints one summary line.
    """
    req = requests.get(url, headers=kv)
    soup = BeautifulSoup(req.text, 'lxml')

    job_name = soup.select('.new_job_name span')[0].string
    job_company = soup.select('.com-name')[0].string
    # Salary, working time and weekly schedule all use the anti-scraping font.
    job_money = _decode_digits(soup.select('.job_msg span')[0].string)
    job_academic = soup.select('.job_academic')[0].string
    job_time = _decode_digits(soup.select('.job_time')[0].string)
    job_week = _decode_digits(soup.select('.job_week')[0].string)
    job_position = soup.select('.job_position')[0].string

    # Original accidentally wrapped this in a one-element set literal, which
    # printed spurious {'...'} braces; print the formatted string directly.
    contents = "工作名字:{},公司:{},工薪:{},要求学历:{},工作时间:{},实习周期:{},地点:{}".format(
        job_name, job_company, job_money, job_academic, job_time, job_week, job_position)
    print(contents)
# Crawl the Beijing IT listing pages and visit every job-detail link found.
# range(1, 2) covers page 1 only; widen the range to crawl more pages.
for page in range(1, 2):
    url = 'https://www.shixiseng.com/bj/it/{}'.format(page)
    req = requests.get(url, headers=kv)
    soup = BeautifulSoup(req.text, 'lxml')
    articles = soup.select('p a')
    for item in articles:
        detail_url = item.get("href")
        # Guard against <a> tags without an href (item.get returns None, and
        # len(None) raised TypeError in the original); only the long absolute
        # detail-page URLs (> 48 chars) are job links worth following.
        if detail_url and len(detail_url) > 48:
            detail_page(detail_url)
#detail_page("https://www.shixiseng.com/intern/inn_ss9mdklhvlsx") 破解编码
'''b'\xee\xaa\x9f\xee\x8c\x85\xee\x8c\x85-\xee\xaa\x9f\xee\x82\x87\xee\x8c\x85/\xe5\xa4\xa9'
100-150/天 3个
\xee\xaa\x9f 1
\xee\x8c\x85 0
\xee\x82\x87 5
b'\xee\xaa\x9f\xee\x8c\x85\xee\x8c\x85-\xee\xaa\x9f\xee\x9f\xa1\xee\x8c\x85/\xe5\xa4\xa9'
100-120
\xee\x9f\xa1 2
b'\xee\xaa\x9f\xee\x82\x87\xee\x8c\x85-\xef\x8a\x97\xee\x8c\x85\xee\x8c\x85/\xe5\xa4\xa9'
150-300
\xef\x8a\x97 3
b'\xee\xaa\x9f\xee\x9f\xa1\xee\x8c\x85-\xee\xaa\x9f\xee\xb2\xbf\xee\x8c\x85/\xe5\xa4\xa9'
120-180
\xee\xb2\xbf 8
b'\xee\xaa\x9f\xee\x82\xab\xee\x8c\x85-\xee\x9f\xa1\xee\x8c\x85\xee\x8c\x85/\xe5\xa4\xa9'
140-200
\xee\x82\xab 4
b'\xee\x82\xab\xee\x8c\x85-\xee\xad\xb7\xee\x8c\x85/\xe5\xa4\xa9'
\xee\xad\xb7 6
b'\xee\xaa\x9f\xee\x8c\x85\xee\x8c\x85-\xee\xaa\x9f\xee\xb3\x9e\xee\xae\x80/\xe5\xa4\xa9'
100-197
\xee\xb3\x9e 9
\xee\xae\x80 7
'''