爬虫(猫眼电影+校花网+github+今日头条+拉钩)

Requests+正则表达式爬取猫眼TOP100榜电影信息

MARK:将信息写入文件解决乱码方法,开启进程池秒爬。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import  requests
from  requests.exceptions  import  RequestException
import  re
import  json
from  multiprocessing  import  Pool
 
 
def get_one_page(url):
    """Fetch one Maoyan board page and return its HTML text.

    Returns None both for non-200 responses and for any requests-level
    failure (connection error, timeout, ...), so callers get a single
    "no page" signal.
    """
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
 
 
def parse_one_page(html):
    """Yield one dict per movie entry found on a Maoyan TOP100 board page.

    Each <dd> block carries: rank, poster URL (data-src), title, star
    list, release info, and a score split into integer + fraction parts.
    """
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    for item in re.findall(pattern, html):
        yield {
            '排行': item[0],
            '图片': item[1],
            '电影': item[2],
            # drop the "主演:" prefix (3 characters)
            '演员': item[3].strip()[3:],
            # drop the "上映时间:" prefix (5 characters)
            '上映信息': item[4].strip()[5:],
            # the score arrives split, e.g. "9." + "5" -> "9.5"
            '评分': item[5] + item[6],
        }
 
 
def write_to_file(content):
    """Append one record to result.txt as a UTF-8 JSON line.

    ensure_ascii=False keeps Chinese characters readable in the file
    (this is the "fix the mojibake" trick mentioned in the header).
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
 
 
def main(offset):
    """Crawl one TOP100 board page at *offset*; print and persist each movie."""
    page_url = 'http://maoyan.com/board/4?offset={}'.format(offset)
    page_html = get_one_page(page_url)
    for movie in parse_one_page(page_html):
        print(movie)
        write_to_file(movie)
 
 
if __name__ == '__main__':
    # Sequential version kept for reference:
    # for i in range(10):
    #     main(i * 10)
    pool = Pool()  # process pool: crawl the 10 pages in parallel
    pool.map(main, [i * 10 for i in range(10)])

Requests+正则表达式爬取校花网视频

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import  requests
import  re
import  os
 
 
def get_page(url):
    """Fetch a page and return its text decoded with the apparent encoding.

    Prints a notice and returns None (implicitly) when the request fails
    or the server answers with an error status. The bare `except:` of the
    original is narrowed to requests' own exception hierarchy so genuine
    bugs (KeyboardInterrupt, typos) are no longer swallowed.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        # the site does not declare its charset reliably; trust the guess
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        print("爬取失败")
 
 
def get_url(html):
    """Yield absolute detail-page URLs scraped from a list page."""
    link_pattern = re.compile('class="items".*?href="(.*?)"', re.S)
    for href in link_pattern.findall(html):
        # relative links need the site prefix prepended
        yield href if href.startswith('http') else 'http://www.xiaohuar.com' + href
 
 
def get_detail_url(detail_content):
    """Yield the .mp4 video URLs found in the detail page's media element."""
    src_pattern = re.compile('id="media".*?src="(.*?)"', re.S)
    for src in src_pattern.findall(detail_content):
        # skip empty captures and non-video sources (posters, images)
        if src and src.endswith('.mp4'):
            yield src
 
 
def download(url):
    """Stream a video to D://movie2//, skipping files already on disk.

    Fixes over the original: the bare `except:` is narrowed so real bugs
    surface, and iter_content gets a chunk size — without one it yields a
    single byte per iteration, which is extremely slow.
    """
    root = "D://movie2//"
    path = root + url.split('/')[-1]
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            # stream=True avoids holding the whole video in memory
            response = requests.get(url, stream=True)
            with open(path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
                print("文件保存成功")
        else:
            print("文件已存在")
    except (requests.RequestException, OSError):
        print("下载失败")
 
 
def main(page_num):
    """Crawl one list page and download every video it links to."""
    list_url = 'http://www.xiaohuar.com/list-3-{0}.html'.format(page_num)
    list_html = get_page(list_url)
    for detail_page in get_url(list_html):
        detail_html = get_page(detail_page)
        for video_url in get_detail_url(detail_html):
            download(video_url)
 
 
if __name__ == '__main__':
    # crawl list pages 0..29 sequentially
    for num in range(30):
        main(num)

Requests+PyQuery模拟登陆github

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
import requests
from pyquery import PyQuery

LOGIN_URL = 'https://github.com/login'
SESSION_URL = 'https://github.com/session'

# A session keeps the cookies set by the GET so the POST is recognised
# as belonging to the same browser session.
session = requests.session()
response = session.get(LOGIN_URL)
text = PyQuery(response.text)
# GitHub embeds a one-time CSRF token in a hidden form input; it must be
# echoed back in the POST or the login is rejected.
authenticity_token = text('#login > form > div:nth-child(1) > input[type="hidden"]:nth-child(2)').attr('value')
data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': 'lcgsmile@qq.com',
    'password': 'lcg@pwd.'
}
response = session.post(SESSION_URL, data=data)
print(response.status_code)  # 200

分析Ajax请求并抓取今日头条街拍美图 

配置文件config.py

1
2
3
4
5
6
7
# MongoDB connection settings used by the toutiao spider.
MONGO_URL  =  'localhost'
MONGO_DB  =  'toutiao'
MONGO_TABLE  =  'toutiao'
 
# Pagination window: offsets GROUP_START*20 .. GROUP_END*20 are crawled.
GROUP_START  =  1
GROUP_END  =  20
# Search keyword ("street snap").
KEYWORD  =  '街拍'

主爬虫文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import  json
import  os
from  urllib.parse  import  urlencode
import  pymongo
import  requests
from  bs4  import  BeautifulSoup
from  requests.exceptions  import  ConnectionError
import  re
from  multiprocessing  import  Pool
from  hashlib  import  md5
from  json.decoder  import  JSONDecodeError
from  config  import  *
 
client  =  pymongo.MongoClient(MONGO_URL, connect = False )   # connect=False: lazy connect, required when sharing across multiprocessing workers
db  =  client[MONGO_DB]
 
 
def get_page_index(offset, keyword):
    """Fetch one page of toutiao search results.

    :param offset: paging offset (multiples of 20)
    :param keyword: search term
    :return: the raw JSON response text, or None on error / non-200
    """
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)  # turn the dict into a URL query string
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None
 
 
def download_image(url):
    """Download one image and hand the raw bytes to save_image.

    Always returns None; connection failures are silently dropped
    (best-effort download, matching the original behaviour).
    """
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except ConnectionError:
        return None
 
 
def save_image(content):
    """Write image bytes to <cwd>/<md5-of-content>.jpg.

    Hashing the content into the filename deduplicates repeated downloads.
    """
    digest = md5(content).hexdigest()
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), digest, 'jpg')
    print(file_path)
    if os.path.exists(file_path):
        return
    with open(file_path, 'wb') as f:
        f.write(content)
 
 
def parse_page_index(text):
    """Yield article URLs from the index-page JSON.

    Yields nothing for malformed JSON — and also for text=None, which
    the original crashed on with TypeError whenever get_page_index
    returned None.
    """
    try:
        data = json.loads(text)  # JSON string -> dict
        if data and 'data' in data:
            for item in data.get('data'):
                yield item.get('article_url')
    except (JSONDecodeError, TypeError):
        pass
 
 
def get_page_detail(url):
    """Fetch an article detail page; return its HTML text or None on error."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None
 
 
def parse_page_detail(html, url):
    """Parse a detail page: extract the title and the gallery image URLs.

    Downloads every image as a side effect and returns a record dict for
    MongoDB, or None when the page carries no gallery data.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_nodes = soup.select('title')
    title = title_nodes[0].get_text() if title_nodes else ''
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        # the gallery JSON is escaped inside a JS string literal;
        # stripping the backslashes yields plain JSON
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data:
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images,
            }
 
 
def save_to_mongo(result):
    """Insert one record into MongoDB; return True on success.

    Uses insert_one: Collection.insert was deprecated in PyMongo 3 and
    removed in PyMongo 4.
    """
    if db[MONGO_TABLE].insert_one(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False
 
 
def main(offset):
    """Crawl one index page: resolve each article and store parsed results."""
    index_text = get_page_index(offset, KEYWORD)
    for article_url in parse_page_index(index_text):
        detail_html = get_page_detail(article_url)
        record = parse_page_detail(detail_html, article_url)
        if record:
            save_to_mongo(record)
 
 
if __name__ == '__main__':
    pool = Pool()
    # offsets 20, 40, ..., GROUP_END*20 — one per worker task
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()

拉勾网自动投递简历

import requests
import re

# 1. ============================================ Authentication flow
session = requests.session()
# Step 1:
# URL: https://passport.lagou.com/login/login.html
# Method: GET
# Headers: only User-Agent

r1 = session.get('https://passport.lagou.com/login/login.html',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                 },
                 )

# The login page embeds per-session anti-forgery values in inline JS.
# NOTE(review): findall(...)[0] raises IndexError if the page layout changes — TODO confirm.
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
# print(X_Anti_Forge_Code)
# print(X_Anti_Forge_Token)


# Step 2:
# 1. URL: https://passport.lagou.com/login/login.json
# 2. Method: POST
# 3. Headers:
#    Referer: https://passport.lagou.com/login/login.html
#    User-Agent:
#    X-Anit-Forge-Code
#    X-Anit-Forge-Token
#    X-Requested-With
# 4. Body:
# isValidate: true
# username: 1111111111
# password: 70621c64832c4d4d66a47be6150b4a8e  # MD5 of the plaintext password alex3714
session.post('https://passport.lagou.com/login/login.json',
             headers={
                 'Referer': 'https://passport.lagou.com/login/login.html',
                 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                 'X-Anit-Forge-Code': X_Anti_Forge_Code,
                 'X-Anit-Forge-Token': X_Anti_Forge_Token,
                 'X-Requested-With': 'XMLHttpRequest'
             },
             data={
                 'isValidate': True,
                 'username': '18611453110',
                 'password': '70621c64832c4d4d66a47be6150b4a8e'
             }
             )

# Step 3:
# 1. URL: https://passport.lagou.com/grantServiceTicket/grant.html
# 2. Method: GET
# 3. Headers:
#    Referer: https://passport.lagou.com/login/login.html
#    User-Agent:

session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
            headers={
                'Referer': 'https://passport.lagou.com/login/login.html',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
            }
            )

# Verify the login worked
response = session.get('https://www.lagou.com/resume/myresume.html',
                       headers={
                           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                       }
                       )

# print('18611453110' in response.text)



# 2. ============================================ Crawl job listings
# 1. URL: https://www.lagou.com/jobs/positionAjax.json
# 2. Method: POST
#    Query params:
#      gj: up to 3 years experience
#      xl: no education requirement
#      jd: no funding requirement
#      hy: mobile internet
#      px: default
#      yx: 15k-25k
#      city: nationwide
# 3. Headers:
# User-Agent
# Referer: https://www.lagou.com/jobs/list_%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
# X-Anit-Forge-Code: 0
# X-Anit-Forge-Token: None
# X-Requested-With: XMLHttpRequest

# 4. Body:
# first: true
# pn: 1
# kd: python data analysis (search keyword)

from urllib.parse import urlencode

params = {'kw': 'python数据分析'}
# percent-encode the keyword for use in the Referer URL
res = urlencode(params).split('=')[-1]
url = 'https://www.lagou.com/jobs/list_' + res
# print(url)


response = session.post('https://www.lagou.com/jobs/positionAjax.json',
                        params={
                            # 'gj': '3年及以下',
                            # 'xl': '不要求',
                            # 'jd': '不需要融资',
                            # 'hy': '移动互联网',
                            'px': 'default',
                            'yx': '15k-25k',
                            'city': '北京',
                            'district': '海淀区',

                        },
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                            'Referer': url,

                        })

# print(response.status_code)
result = response.json()['content']['positionResult']['result']
for comanpy_info in result:
    fullname = comanpy_info['companyFullName']
    emp_num = comanpy_info['companySize']
    salary = comanpy_info['salary']
    workyear = comanpy_info['workYear']
    positionName = comanpy_info['positionName']
    positionId = comanpy_info['positionId']
    detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)

    print(detail_url)
    print(fullname)
    print(emp_num)
    print(salary)
    print(workyear)
    print(positionName)
    print(positionId)
    print()

    # 3. ============================================ Apply to each position
    # Step 1: request the detail page:
    # 1. detail_url: https://www.lagou.com/jobs/3984845.html
    # 2. Method: GET
    # 3. Headers:
    #    User-Agent
    r1 = session.get(detail_url,
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     }
                     )

    # each detail page carries fresh anti-forgery values
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

    # Step 2: submit the resume
    # 1. URL: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
    # 2. Method: POST
    # 3. Headers:
    # User-Agent
    # Referer: detail_url
    # X-Anit-Forge-Code: 31832262
    # X-Anit-Forge-Token: 9ee8b4bc-7107-49a0-a205-cedd7e77c2d7
    # X-Requested-With: XMLHttpRequest

    # 4. Body:
    # 'positionId': 3984845
    # 'type': 1
    # 'force': True

    session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     'Referer': detail_url,
                     'X-Anit-Forge-Code': X_Anti_Forge_Code,
                     'X-Anit-Forge-Token': X_Anti_Forge_Token,
                     'X-Requested-With': 'XMLHttpRequest'
                 },
                 data={
                     'positionId': positionId,
                     'type': 1,
                     'force': True
                 }

                 )

    print('投递成功',detail_url)

lagou

 

复制代码
import requests
import re

# 1. ============================================ Authentication flow
session = requests.session()
# Step 1:
# URL: https://passport.lagou.com/login/login.html
# Method: GET
# Headers: only User-Agent

r1 = session.get('https://passport.lagou.com/login/login.html',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                 },
                 )

# The login page embeds per-session anti-forgery values in inline JS.
# NOTE(review): findall(...)[0] raises IndexError if the page layout changes — TODO confirm.
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
# print(X_Anti_Forge_Code)
# print(X_Anti_Forge_Token)


# Step 2:
# 1. URL: https://passport.lagou.com/login/login.json
# 2. Method: POST
# 3. Headers:
#    Referer: https://passport.lagou.com/login/login.html
#    User-Agent:
#    X-Anit-Forge-Code
#    X-Anit-Forge-Token
#    X-Requested-With
# 4. Body:
# isValidate: true
# username: 1111111111
# password: 70621c64832c4d4d66a47be6150b4a8e  # MD5 of the plaintext password alex3714
session.post('https://passport.lagou.com/login/login.json',
             headers={
                 'Referer': 'https://passport.lagou.com/login/login.html',
                 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                 'X-Anit-Forge-Code': X_Anti_Forge_Code,
                 'X-Anit-Forge-Token': X_Anti_Forge_Token,
                 'X-Requested-With': 'XMLHttpRequest'
             },
             data={
                 'isValidate': True,
                 'username': '18611453110',
                 'password': '70621c64832c4d4d66a47be6150b4a8e'
             }
             )

# Step 3:
# 1. URL: https://passport.lagou.com/grantServiceTicket/grant.html
# 2. Method: GET
# 3. Headers:
#    Referer: https://passport.lagou.com/login/login.html
#    User-Agent:

session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
            headers={
                'Referer': 'https://passport.lagou.com/login/login.html',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
            }
            )

# Verify the login worked
response = session.get('https://www.lagou.com/resume/myresume.html',
                       headers={
                           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                       }
                       )

# print('18611453110' in response.text)



# 2. ============================================ Crawl job listings
# 1. URL: https://www.lagou.com/jobs/positionAjax.json
# 2. Method: POST
#    Query params:
#      gj: up to 3 years experience
#      xl: no education requirement
#      jd: no funding requirement
#      hy: mobile internet
#      px: default
#      yx: 15k-25k
#      city: nationwide
# 3. Headers:
# User-Agent
# Referer: https://www.lagou.com/jobs/list_%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
# X-Anit-Forge-Code: 0
# X-Anit-Forge-Token: None
# X-Requested-With: XMLHttpRequest

# 4. Body:
# first: true
# pn: 1
# kd: python data analysis (search keyword)

from urllib.parse import urlencode

params = {'kw': 'python数据分析'}
# percent-encode the keyword for use in the Referer URL
res = urlencode(params).split('=')[-1]
url = 'https://www.lagou.com/jobs/list_' + res
# print(url)


response = session.post('https://www.lagou.com/jobs/positionAjax.json',
                        params={
                            # 'gj': '3年及以下',
                            # 'xl': '不要求',
                            # 'jd': '不需要融资',
                            # 'hy': '移动互联网',
                            'px': 'default',
                            'yx': '15k-25k',
                            'city': '北京',
                            'district': '海淀区',

                        },
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                            'Referer': url,

                        })

# print(response.status_code)
result = response.json()['content']['positionResult']['result']
for comanpy_info in result:
    fullname = comanpy_info['companyFullName']
    emp_num = comanpy_info['companySize']
    salary = comanpy_info['salary']
    workyear = comanpy_info['workYear']
    positionName = comanpy_info['positionName']
    positionId = comanpy_info['positionId']
    detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)

    print(detail_url)
    print(fullname)
    print(emp_num)
    print(salary)
    print(workyear)
    print(positionName)
    print(positionId)
    print()

    # 3. ============================================ Apply to each position
    # Step 1: request the detail page:
    # 1. detail_url: https://www.lagou.com/jobs/3984845.html
    # 2. Method: GET
    # 3. Headers:
    #    User-Agent
    r1 = session.get(detail_url,
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     }
                     )

    # each detail page carries fresh anti-forgery values
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

    # Step 2: submit the resume
    # 1. URL: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
    # 2. Method: POST
    # 3. Headers:
    # User-Agent
    # Referer: detail_url
    # X-Anit-Forge-Code: 31832262
    # X-Anit-Forge-Token: 9ee8b4bc-7107-49a0-a205-cedd7e77c2d7
    # X-Requested-With: XMLHttpRequest

    # 4. Body:
    # 'positionId': 3984845
    # 'type': 1
    # 'force': True

    session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     'Referer': detail_url,
                     'X-Anit-Forge-Code': X_Anti_Forge_Code,
                     'X-Anit-Forge-Token': X_Anti_Forge_Token,
                     'X-Requested-With': 'XMLHttpRequest'
                 },
                 data={
                     'positionId': positionId,
                     'type': 1,
                     'force': True
                 }

                 )

    print('投递成功',detail_url)
import requests
import re

# 1. ============================================ Authentication flow
session = requests.session()
# Step 1:
# URL: https://passport.lagou.com/login/login.html
# Method: GET
# Headers: only User-Agent

r1 = session.get('https://passport.lagou.com/login/login.html',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                 },
                 )

# The login page embeds per-session anti-forgery values in inline JS.
# NOTE(review): findall(...)[0] raises IndexError if the page layout changes — TODO confirm.
X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
# print(X_Anti_Forge_Code)
# print(X_Anti_Forge_Token)


# Step 2:
# 1. URL: https://passport.lagou.com/login/login.json
# 2. Method: POST
# 3. Headers:
#    Referer: https://passport.lagou.com/login/login.html
#    User-Agent:
#    X-Anit-Forge-Code
#    X-Anit-Forge-Token
#    X-Requested-With
# 4. Body:
# isValidate: true
# username: 1111111111
# password: 70621c64832c4d4d66a47be6150b4a8e  # MD5 of the plaintext password alex3714
session.post('https://passport.lagou.com/login/login.json',
             headers={
                 'Referer': 'https://passport.lagou.com/login/login.html',
                 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                 'X-Anit-Forge-Code': X_Anti_Forge_Code,
                 'X-Anit-Forge-Token': X_Anti_Forge_Token,
                 'X-Requested-With': 'XMLHttpRequest'
             },
             data={
                 'isValidate': True,
                 'username': '18611453110',
                 'password': '70621c64832c4d4d66a47be6150b4a8e'
             }
             )

# Step 3:
# 1. URL: https://passport.lagou.com/grantServiceTicket/grant.html
# 2. Method: GET
# 3. Headers:
#    Referer: https://passport.lagou.com/login/login.html
#    User-Agent:

session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
            headers={
                'Referer': 'https://passport.lagou.com/login/login.html',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
            }
            )

# Verify the login worked
response = session.get('https://www.lagou.com/resume/myresume.html',
                       headers={
                           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                       }
                       )

# print('18611453110' in response.text)



# 2. ============================================ Crawl job listings
# 1. URL: https://www.lagou.com/jobs/positionAjax.json
# 2. Method: POST
#    Query params:
#      gj: up to 3 years experience
#      xl: no education requirement
#      jd: no funding requirement
#      hy: mobile internet
#      px: default
#      yx: 15k-25k
#      city: nationwide
# 3. Headers:
# User-Agent
# Referer: https://www.lagou.com/jobs/list_%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
# X-Anit-Forge-Code: 0
# X-Anit-Forge-Token: None
# X-Requested-With: XMLHttpRequest

# 4. Body:
# first: true
# pn: 1
# kd: python data analysis (search keyword)

from urllib.parse import urlencode

params = {'kw': 'python数据分析'}
# percent-encode the keyword for use in the Referer URL
res = urlencode(params).split('=')[-1]
url = 'https://www.lagou.com/jobs/list_' + res
# print(url)


response = session.post('https://www.lagou.com/jobs/positionAjax.json',
                        params={
                            # 'gj': '3年及以下',
                            # 'xl': '不要求',
                            # 'jd': '不需要融资',
                            # 'hy': '移动互联网',
                            'px': 'default',
                            'yx': '15k-25k',
                            'city': '北京',
                            'district': '海淀区',

                        },
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                            'Referer': url,

                        })

# print(response.status_code)
result = response.json()['content']['positionResult']['result']
for comanpy_info in result:
    fullname = comanpy_info['companyFullName']
    emp_num = comanpy_info['companySize']
    salary = comanpy_info['salary']
    workyear = comanpy_info['workYear']
    positionName = comanpy_info['positionName']
    positionId = comanpy_info['positionId']
    detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)

    print(detail_url)
    print(fullname)
    print(emp_num)
    print(salary)
    print(workyear)
    print(positionName)
    print(positionId)
    print()

    # 3. ============================================ Apply to each position
    # Step 1: request the detail page:
    # 1. detail_url: https://www.lagou.com/jobs/3984845.html
    # 2. Method: GET
    # 3. Headers:
    #    User-Agent
    r1 = session.get(detail_url,
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     }
                     )

    # each detail page carries fresh anti-forgery values
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

    # Step 2: submit the resume
    # 1. URL: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
    # 2. Method: POST
    # 3. Headers:
    # User-Agent
    # Referer: detail_url
    # X-Anit-Forge-Code: 31832262
    # X-Anit-Forge-Token: 9ee8b4bc-7107-49a0-a205-cedd7e77c2d7
    # X-Requested-With: XMLHttpRequest

    # 4. Body:
    # 'positionId': 3984845
    # 'type': 1
    # 'force': True

    session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     'Referer': detail_url,
                     'X-Anit-Forge-Code': X_Anti_Forge_Code,
                     'X-Anit-Forge-Token': X_Anti_Forge_Token,
                     'X-Requested-With': 'XMLHttpRequest'
                 },
                 data={
                     'positionId': positionId,
                     'type': 1,
                     'force': True
                 }

                 )

    print('投递成功',detail_url)

lagou

 

转载于:https://www.cnblogs.com/yunlongaimeng/p/9802151.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值