1.boss直聘的cookie过期很快,每次只能爬取3-4页;cookie过期后就重新复制一份新的,再继续爬取。
import requests
from lxml import etree
import pandas as pd
# Listing URL for BOSS Zhipin: city code 101180100, query "外贸业务员"
# (URL-encoded); the page number gets appended per request.
url_1 = "https://www.zhipin.com/c101180100/?query=%E5%A4%96%E8%B4%B8%E4%B8%9A%E5%8A%A1%E5%91%98&page="
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    # NOTE: the Zhipin cookie expires quickly -- paste a fresh one before each run.
    "cookie": "此处省略"
}
# Smoke-test the cookie against page 1 before starting the scraping loops.
# BUG FIX: the original line referenced an undefined name `url` (NameError).
requests_get = requests.get(url_1 + "1", headers=headers)
requests_get.encoding = "utf-8"
2.首先找到网页的规律:复制2-3页的网址对比一下即可得出,翻页时只有page参数在变,例如:
url = 'https://www.zhipin.com/c101200100/?query=%E8%B7%A8%E5%A2%83%E7%94%B5%E5%95%86%E8%BF%90%E8%90%A5&page=' + str(page) + '&ka=page-' + str(page)
3.然后开始用for循环,因为cookie很容易过期,所以每次只爬取3页。
# Batch 1: company names from listing pages 1-3.
# (The original paste lost the loop-body indentation; restored here.)
name1 = []
for page in range(1, 4):
    url = url_1 + str(page) + '&ka=page-' + str(page)
    requests_get = requests.get(url, headers=headers)
    html = etree.HTML(requests_get.text)
    # Company name is the anchor text inside each company-text card header.
    name2 = html.xpath('//div[@class="company-text"]/h3/a/text()')
    name1.extend(name2)
# Batch 2: listing pages 4-6, same extraction as batch 1.
# (Loop-body indentation restored; the trailing bare `name4` inspection
# line was a notebook-style no-op and has been removed.)
name4 = []
for page in range(4, 7):
    url = url_1 + str(page) + '&ka=page-' + str(page)
    requests_get = requests.get(url, headers=headers)
    html = etree.HTML(requests_get.text)
    name7 = html.xpath('//div[@class="company-text"]/h3/a/text()')
    name4.extend(name7)
4.按同样的方式继续写后面的循环,一直写到name12(即全部页都爬完),然后把所有公司名称保存到excel。
# Concatenate every scraped batch and export to Excel.
# BUG FIX: the original also added `name7` (only the last page, already
# contained in name4 -> duplicates) and `name12`, which is never defined
# in this listing and would raise NameError.  Append further batch lists
# (name8, name12, ...) here once their loops exist.
company_name = name1 + name4
df = pd.DataFrame()
df['公司名称'] = company_name
df.to_excel('C:/Users/123/Desktop/数据.xlsx', index=False)
5.保存好之后新开一个页面重新写企查查的爬虫
import requests
from lxml import etree
import pandas as pd
# Reload the company names saved by the Zhipin scraper.
df = pd.read_excel('C:/Users/123/Desktop/数据.xlsx')
name = list(df["公司名称"])
# (Removed the bare `len(name)` notebook-style inspection line; it is a no-op.)
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    # NOTE: the qcc.com cookie also expires quickly -- refresh before each run.
    "cookie": "此处省略"}
# Smoke-test the cookie with one fixed search before looping over all names.
url = "https://www.qcc.com/web/search?key=%E5%87%A4%E5%B7%A2%E8%B7%A8%E5%A2%83"
requests_get = requests.get(url, headers=headers)
requests_get.encoding = "utf-8"
html = etree.HTML(requests_get.text)
6.爬取公司名字的时候,名字会被拆成用逗号隔开的多段文本,可以用 ''.join(name1) 拼接成一个完整字符串来解决。
# Look up each company on qcc.com (filtered to province "HEN" in the
# URL-encoded filter parameter) and collect the matched registered name.
# BUG FIX: the original used range(1, 524), which skips name[0] and
# hardcodes the list length; iterate over the whole list instead.
# (The trailing bare `company_name` inspection line was a no-op; removed.)
company_name = []
for key in name:
    url = "https://www.qcc.com/web/search?key=" + key + "&filter=%7B%22rchain%22%3A%5B%7B%22pr%22%3A%22HEN%22%7D%5D%7D"
    requests_get = requests.get(url, headers=headers)
    requests_get.encoding = "utf-8"
    html = etree.HTML(requests_get.text)
    # The matched name arrives as several text fragments; join them back
    # into one string (see the note in step 6).
    parts = html.xpath('*//*[@class="maininfo"]/a//*/text()')
    company_name.append(''.join(parts))
# Collect each company's address from the search-result card.
# BUG FIXES: same iteration fix as the name loop (range(1, 524) skipped
# name[0] and hardcoded the length); and the original appended the raw
# xpath result list, leaving Python list objects inside the Excel cells --
# join the fragments into one string, consistent with the name loop.
company_address = []
for key in name:
    url = "https://www.qcc.com/web/search?key=" + key + "&filter=%7B%22rchain%22%3A%5B%7B%22pr%22%3A%22HEN%22%7D%5D%7D"
    requests_get = requests.get(url, headers=headers)
    requests_get.encoding = "utf-8"
    html = etree.HTML(requests_get.text)
    spans = html.xpath('*//*[@class="relate-info"]/div[3]/span/span/text()')
    company_address.append(''.join(spans))
# Collect each company's phone number from the search-result card.
# BUG FIXES: same as the address loop -- iterate the whole name list
# instead of range(1, 524), and join the xpath text fragments into one
# string instead of storing the raw list in the cell.
company_num = []
for key in name:
    url = "https://www.qcc.com/web/search?key=" + key + "&filter=%7B%22rchain%22%3A%5B%7B%22pr%22%3A%22HEN%22%7D%5D%7D"
    requests_get = requests.get(url, headers=headers)
    requests_get.encoding = "utf-8"
    html = etree.HTML(requests_get.text)
    nums = html.xpath('//*[@class="val"]/*[2]/text()')
    company_num.append(''.join(nums))
# Assemble the three parallel result lists and export to Excel.
# (Removed the bare `df1` notebook-style inspection line; it is a no-op.)
df1 = pd.DataFrame()
df1['公司地址'] = company_address
df1['公司名称'] = company_name
df1['公司电话'] = company_num
df1.to_excel('C:/Users/123/Desktop/数据1.xlsx', index=False)