Three Python web-scraping examples (with source code)

Case 1: Scraping user reviews of a Huawei phone on JD.com

This example scrapes user review data from the JD.com product page of a Huawei phone, so that the phone can be analysed from the users' point of view across several dimensions such as performance, quality and price.

import requests
import json

class Jdcomment_spider():
    def __init__(self, file_name='jingdong_comment'):
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}

        # Open the output file
        self.fp = open(f'./{file_name}.txt', 'w', encoding='utf-8')
        print(f'Spider started, opened file {file_name}!')

    def parse_one_page(self, url):
        # URL of the JD comment API; the callback suffix has to be stripped off
        # url = 'https://club.jd.com/comment/productPageComments.action?productId=10025237646790&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1'
        response = requests.get(url, headers=self.headers)
        # print(response.text)

        # Inspect the headers actually sent with the request
        # print(response.request.headers)

        # Option 1: convert the JSON string into a dict
        js_data = json.loads(response.text, strict=False)
        # print(type(js_data))
        # Option 2:
        # js_data = response.json()

        # Extract the data
        comment_list = js_data['comments']
        # print(comment_list)
        for comment in comment_list:
            # Comment id
            goods_id = comment.get('id')
            # Reviewer nickname
            nickname = comment.get('nickname')
            print(nickname)
            # Rating
            score = comment.get('score')
            # Product model
            productSize = comment.get('productSize')
            # Product colour
            productColor = comment.get('productColor')
            # Review time
            creationTime = comment.get('creationTime')
            # Review text
            content = comment.get('content')
            # Replace newlines with spaces (or split on them)
            content = content.replace('\n', ' ')
            print(content)

            # Save the record as one tab-separated line
            self.fp.write(f'{goods_id}\t{nickname}\t{score}\t{productSize}\t{productColor}\t{creationTime}\t{content}\n')

    def parse_max_page(self):
        for page_num in range(70):
            print(f'Crawling page {page_num}')
            url = f'https://club.jd.com/comment/productPageComments.action?productId=10025237646790&score=0&sortType=5&page={page_num}&pageSize=10&isShadowSku=0&fold=1'
            self.parse_one_page(url=url)

    def close_files(self):
        self.fp.close()
        print('Spider finished, file closed!')

if __name__ == '__main__':
    # Instantiate the spider
    jd_spider = Jdcomment_spider()
    # Crawl all pages, then close the file
    jd_spider.parse_max_page()
    jd_spider.close_files()
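
For the multi-dimensional analysis mentioned at the start of this case, the tab-separated file written by the spider can be loaded into pandas. A minimal sketch, assuming the jingdong_comment.txt produced above and the column order used in the fp.write call (pandas itself is an extra dependency, not used in the original script):

import pandas as pd

# Column order matches the fp.write call above; the file has no header row
cols = ['id', 'nickname', 'score', 'productSize', 'productColor', 'creationTime', 'content']
df = pd.read_csv('./jingdong_comment.txt', sep='\t', names=cols, encoding='utf-8')

# Rating distribution and average rating per colour variant
print(df['score'].value_counts().sort_index())
print(df.groupby('productColor')['score'].mean())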

Case 2: Downloading academic papers found via Baidu Scholar

This example retrieves and downloads papers by combining Baidu Scholar with Sci-Hub. The main steps are: build the URL of a keyword search on the Baidu Scholar search page; on the paper page that the search result links to, view the page source and locate the DOI (extracted with a regular expression); then plug the extracted DOI into a Sci-Hub search URL to obtain the address of the downloadable PDF.

#import modules
import requests
import re
import os
from urllib.request import urlretrieve

#build the search URL
def get_url(key):
    url = 'https://xueshu.baidu.com/s?wd=' + key + '&rsv_bp=0&tn=SE_baiduxueshu_c1gjeupa&rsv_spt=3&ie=utf-8&f=8&rsv_sug2=0&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D'
    return url

#request headers used to get past the anti-crawling check
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
           # the original post used a long, session-specific cookie here; paste a fresh cookie copied from your own browser
           'cookie': 'YOUR_COOKIE_HERE'}

#extract the DOI of each paper
def get_paper_link(headers, key):
    response = requests.get(url=get_url(key), headers=headers)
    data = response.text
    paper_link = re.findall(r'<h3 class="t c_font">\n + \n + <a href="(.*)"', data)  # the group captures the paper's page URL
    doi_list = []  # list collecting the DOIs
    for link in paper_link:
        paper_link = 'http:' + link
        response2 = requests.get(url=paper_link, headers=headers)
        res_data = response2.text
        try:
            paper_doi = re.findall(r'\'doi\'}\">\n +(.*?)\n ', res_data)
            if str(10) in paper_doi[0]:   # DOIs start with "10."
                doi_list.append(paper_doi)
        except:
            pass
    return doi_list

#build the Sci-Hub download link
def doi_download(headers, key):
    doi_list = get_paper_link(headers, key)
    for doi in doi_list:
        doi_link = "https://sci-hub.tf/" + doi[0]
        print(doi_link)

        if 'https:' not in doi_link:
            doi_link = 'https:' + doi_link
        res = requests.get(url=doi_link, headers=headers)
        down_link = re.findall(r'<iframe.*?src="(.*?)" id=.*?</iframe>', res.text)[0]
        print(down_link)
        r = requests.get(url=down_link, headers=headers)
        path = doi_link.split('/')[-1] + '.pdf'
        with open(path, 'wb') as f:
            f.write(r.content)
            print('Download finished: ' + doi_link.split('/')[-1])

key = input('Enter the paper to download: ')
doi_download(headers=headers, key=key)
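
The DOI regex above is tied to the exact whitespace in Baidu Scholar's page source, which changes frequently. A looser fallback is to match the DOI pattern itself anywhere in the HTML; the sketch below uses an approximate DOI pattern of my own and is not part of the original script:

import re

def extract_dois(html):
    # A DOI is "10." followed by a registrant code, a slash and a suffix;
    # this pattern is a rough approximation, not an official grammar
    return re.findall(r'\b10\.\d{4,9}/[^\s"<>]+', html)

# Example:
sample = '<span class="doi_wr">DOI: 10.1016/j.cell.2020.01.021</span>'
print(extract_dois(sample))   # ['10.1016/j.cell.2020.01.021']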

Case 3: Collecting and analysing job listings from Zhaopin

This example collects job listings for the Shanghai area from the Zhaopin recruitment site, gathering the job title, salary range, district, required experience, education requirement, company name, company type and company size, and then performs some simple analysis and processing of the results.

import requests
import re
import openpyxl
import time

head = {
    # the original post used a very long, session-specific cookie here; paste a fresh cookie from your own browser
    'cookie': "YOUR_COOKIE_HERE",
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
}

def reqdata():
    position_name_data = []
    wage_range_data = []
    region_data = []
    working_years_data = []
    education_requirements_data = []
    enterprise_name_data = []
    enterprise_nature_data = []
    enterprise_scale_data = []
    all_list = []
    for num in range(1, 10):
        # jl=538 is the city code; each "re" value selects one district for the search
        for re_code in (2029, 2036, 2035, 2026, 2019, 2030, 2023, 2032, 2028,
                        2031, 2024, 2034, 2033, 2021, 2027, 2022):
            all_list.append('https://sou.zhaopin.com/?jl=538&p={}&re={}'.format(num, re_code))
    for url in all_list:
        time.sleep(1)
        print('----downloading----', url)
        res = requests.get(url, headers=head).text.replace('\xa0', '')
        # Job title
        position_name = re.findall('(?<=jobname"><span title=")(.+?)(?=" class="iteminfo)', res)
        for a in position_name:
            position_name_data.append(a)
        # Salary range
        wage_r = re.findall('(?<=<p class="iteminfo__line2__jobdesc__salary">)(.+?)(?=<!----></p>)', res, re.S)
        wage_range = []
        for i in wage_r:
            j = i.strip()
            wage_range.append(j)
        for b in wage_range:
            wage_range_data.append(b)
        # District
        region = re.findall('(?<=<li class="iteminfo__line2__jobdesc__demand__item">)(.+?)(?=</li> <li class=")', res)[::2]
        for c in region:
            region_data.append(c)
        # Years of experience required
        working_years = re.findall('(?<=<li class="iteminfo__line2__jobdesc__demand__item">)(.+?)(?=</li> <li class=")', res)[1::2]
        for d in working_years:
            working_years_data.append(d)
        # Education requirement
        education_requirements = re.findall('(?<=<li class="iteminfo__line2__jobdesc__demand__item">)(.+?)(?=</li>)', res)[2::3]
        for e in education_requirements:
            education_requirements_data.append(e)
        # Company name
        erro_name = re.findall('(?<=" alt=")(.+?)(?=</span>)', res)
        enterprise_name = re.findall('(?<=<span title=")(.+?)(?=" class="iteminfo)', str(erro_name))
        for f in enterprise_name:
            enterprise_name_data.append(f)
        # Company type
        enterprise_nature = re.findall('(?<=<span class="iteminfo__line2__compdesc__item">)(.+?)(?=</span>)', res)[::2]
        for g in enterprise_nature:
            enterprise_nature_data.append(g)
        # Company size
        enterprise_scale = re.findall('(?<=<span class="iteminfo__line2__compdesc__item">)(.+?)(?=</span>)', res)[1::2]
        for h in enterprise_scale:
            enterprise_scale_data.append(h)
    return position_name_data, wage_range_data, region_data, working_years_data, education_requirements_data, enterprise_name_data, enterprise_nature_data, enterprise_scale_data

if __name__ == '__main__':
    (a1, a2, a3, a4, a5, a6, a7, a8) = reqdata()
    work = openpyxl.Workbook()
    wke = work.create_sheet(index=0, title='招聘信息')
    # Header row; the analysis scripts below read these column names
    wke.cell(row=1, column=1).value = '职位名称'
    wke.cell(row=1, column=2).value = '薪资范围'
    wke.cell(row=1, column=3).value = '地区'
    wke.cell(row=1, column=4).value = '工作年限'
    wke.cell(row=1, column=5).value = '学历要求'
    wke.cell(row=1, column=6).value = '企业名称'
    wke.cell(row=1, column=7).value = '企业性质'
    wke.cell(row=1, column=8).value = '企业规模'
    # Write one row per job listing
    for b1, b2, b3, b4, b5, b6, b7, b8, i in zip(a1, a2, a3, a4, a5, a6, a7, a8, range(2, 5000)):
        wke.cell(row=i, column=1).value = b1
        wke.cell(row=i, column=2).value = b2
        wke.cell(row=i, column=3).value = b3
        wke.cell(row=i, column=4).value = b4
        wke.cell(row=i, column=5).value = b5
        wke.cell(row=i, column=6).value = b6
        wke.cell(row=i, column=7).value = b7
        wke.cell(row=i, column=8).value = b8
    work.save('./招聘信息汇总表.xlsx')

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

# Region column of the spreadsheet produced above
region_values = list(pd.read_excel(r'招聘信息汇总表.xlsx')['地区'])
districts = ['上海','上海-松江区','上海-徐汇区','上海-长宁区','上海-普陀区','上海-虹口区','上海-崇明区','上海-杨浦区','上海-金山区','上海-黄浦区','上海-闵行区','上海-宝山区','上海-嘉定区','上海-浦东新区','上海-青浦区','上海-静安区','上海-奉贤区']
counts = [region_values.count(district) for district in districts]

# Use a Chinese font so the labels display correctly
mpl.rcParams['font.family'] = 'SimHei'
# Figure size (inches) and resolution (dpi)
plt.figure(figsize=(9, 6), dpi=100)
plt.axes(aspect='equal')   # keep the pie chart circular
color = ['red', 'y', 'c', 'b', 'cyan', '#FF69B4', '#FFB6C1', '#6B4226', 'yellow', '#E47833', 'greenyellow', '#545454', '#FF00FF', '#32CD99', '#00FFFF', '#545454', '#B5A642']
# Draw the pie chart
# counts: the data          labels: slice labels        colors: custom colours
# autopct: percentage format with two decimals          shadow: drop shadow for a 3D look
# startangle: rotation of the first slice               labeldistance: label distance from the centre
plt.pie(counts, labels=districts, colors=color, autopct='%.2f%%', shadow=True, startangle=30, labeldistance=1.1)
plt.title('职位地区分布饼状图', fontsize=15)
plt.savefig(fname="职位地区分布饼状图.png")
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # set a Chinese font, otherwise Chinese labels are garbled
df = pd.read_excel(r"招聘信息汇总表.xlsx")
# Count how often each experience requirement appears, sorted so labels and values stay aligned
height = df['工作年限'].value_counts().sort_index()
skill = [f'{m}' for m in height.index]
counts = height.values

plt.figure(figsize=(12, 5), dpi=100)
# Plot style
plt.style.use('ggplot')
plt.plot(skill[::-1], counts[::-1], linewidth=8, color='y', marker='o',
         markerfacecolor='blue', markersize=12)

def to_percent(temp, position):
    return '%1.0f' % (5 * position) + '%'
plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))

plt.title('工作年限要求折线图')
plt.xlabel('工作年限')
plt.ylabel('所占百分比')
plt.savefig(fname="工作年限要求折线图.png")
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.ticker import FuncFormatter

plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # set a Chinese font, otherwise Chinese labels are garbled
df = pd.read_excel(r"招聘信息汇总表.xlsx")
skill_count = df['学历要求'].value_counts()
skill = [f'{m}' for m in skill_count.index]    # the distinct education levels
counts = skill_count.values.tolist()           # number of listings for each level
# Chinese font for the plot
mpl.rcParams['font.family'] = 'SimHei'
# Figure size (inches) and resolution (dpi)
plt.figure(figsize=(9, 6), dpi=100)
# Horizontal bar chart
plt.barh(skill[::-1], counts[::-1], height=0.5, color='#4169E1')
plt.title('学历要求柱状图')
plt.xlabel('人数所占百分比')
def to_percent(temp, position):
    return '%1.0f' % (0.02 * temp) + '%'
plt.gca().xaxis.set_major_formatter(FuncFormatter(to_percent))
plt.yticks(['学历不限','初中及以下','中专/中技','高中','大专','本科','硕士','MBA/EMBA','博士'])
plt.savefig(fname="学历要求柱状图.png")
plt.show()
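
The salary column is collected above but never analysed. Below is a minimal sketch of converting the salary strings into approximate monthly figures; it assumes Zhaopin shows ranges such as '8千-1.5万' or '1万-2万' (an assumption about the display format, so adjust the pattern to whatever actually appears in the spreadsheet), and the 平均月薪 column added here is my own, not part of the original post:

import re
import pandas as pd

def parse_salary(text):
    # Find endpoints like '8千' or '1.5万' and convert them to yuan per month;
    # returns None when the string does not match the assumed format
    parts = re.findall(r'([\d.]+)(万|千)', str(text))
    if not parts:
        return None
    values = [float(n) * (10000 if unit == '万' else 1000) for n, unit in parts]
    return sum(values) / len(values)   # midpoint of the range

df = pd.read_excel('招聘信息汇总表.xlsx')
df['平均月薪'] = df['薪资范围'].apply(parse_salary)
print(df.groupby('学历要求')['平均月薪'].mean().sort_values())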


This article is reposted from https://www.cnblogs.com/XL2COWARD/p/14549720.html; in case of infringement, please contact us for removal.
