写在前面
- 数据分析场景需要该类字典数据
- 大多数网站的数据不全或未及时更新,令人头疼
- 废话省略
- 必备信息

爬取与解析
# Imports and process-wide configuration for the number-segment scraper.
import re
import time
import random
import requests
import warnings
import datetime
import pandas as pd
from numpy import NAN
# NOTE: this rebinds the name `datetime` from the module imported above to
# the `datetime.datetime` class; after this line `datetime` is the class.
from datetime import datetime
from bs4 import BeautifulSoup
# Silence every warning category for the rest of the process.
warnings.filterwarnings('ignore')
# Widen pandas' display limits (row count and per-cell width) for inspection.
pd.set_option('display.max_rows',5000)
pd.set_option('display.max_colwidth',500)
import socket
# Default timeout, in seconds, applied to socket objects created from now on.
socket.setdefaulttimeout(20)
# HTTP request headers; passed as `headers=` to the page request below.
header = {'User-Agent': 'Mozilla/5.0'}
# NOTE(review): queue/threading (like time, random, NAN) are not used in the
# visible part of this script - confirm before relying on or removing them.
import queue
import threading
# Page that lists number segments; the path percent-decodes to "电信号段".
tp_url = "https://www.chahaoba.com/%E7%94%B5%E4%BF%A1%E5%8F%B7%E6%AE%B5"
# requests applies no timeout unless one is passed explicitly, so pass the
# same 20 s the script configures as the socket default to avoid hanging.
response = requests.get(tp_url, headers=header, timeout=20)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page and
# silently producing a table that only contains the manual entries below.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
# Every <li> whose text contains ":" is taken as "<segment>:<details>".
# Split on the first colon only so the result is always exactly two fields
# (details may themselves contain a colon), and strip both fields so a key
# such as "133 " is still recognised as a plain numeric segment later on.
num_segment = [
    [part.strip() for part in text.split(":", 1)]
    for text in (li.getText().strip() for li in soup.find_all("li"))
    if ":" in text
]
# Manually maintained entries appended to the scraped list.
# Key format "prefix(a~b)" denotes the range of suffixes a..b for that prefix.
other = [
    ['1740(0~5)','中国电信,卫星移动通信业务专用号段'],
    ['1740(6~9)','工业和信息化部应急通讯保障中心,卫星移动通信业务专用号段 (用于国家应急通信需求)'],
    ['1740(10~12)','工业和信息化部应急通讯保障中心,卫星移动通信业务专用号段 (用于国家应急通信需求)'],
    ['199(0~9)','中国电信,公众移动通信网网号'],
    ['198(0~9)','中国移动,公众移动通信网网号'],
    ['166(0~9)','中国联通,公众移动通信网网号'],
    ['148(0~9)','中国移动,公众移动通信网网号(物联网业务专用号段)'],
    ['146(0~9)','中国联通,公众移动通信网网号(物联网业务专用号段)'],
    ['1410(0~9)','中国电信,物联网网号'],
    ['1440(0~9)','中国移动,物联网网号'],
]
num_segment += other
# Normalise every [segment_key, details] pair into rows of
# [seg, seg_details, operator, detail].
#   * A purely numeric key yields one row.
#   * A key such as "1740(0~5)" (exactly three numbers: prefix, start, end)
#     yields one row per value in start..end inclusive, with the value
#     appended to the prefix.
#   * Any other key is skipped.
qx_num_segment = []
for ele in num_segment:
    # The operator is the text before the first comma; the remaining
    # comma-separated parts are concatenated (without commas) as the detail.
    # Computed once per entry rather than once per expanded segment.
    tele = ele[1].split(',')
    oper, detail = tele[0], "".join(tele[1:])
    if ele[0].isdigit():
        # Build the row from the first two fields only, so an entry that
        # carries extra fields cannot produce a row wider than four columns.
        qx_num_segment.append([ele[0], ele[1], oper, detail])
        continue
    seg_meta = re.findall(r"\d+\.?\d*", ele[0])
    if len(seg_meta) != 3:
        continue
    for s in range(int(seg_meta[1]), int(seg_meta[-1]) + 1):
        qx_num_segment.append([seg_meta[0] + str(s), ele[1], oper, detail])
# Tabulate the expanded rows, one per concrete segment.
num_segment_df = pd.DataFrame(
    qx_num_segment,
    columns=['seg', 'seg_details', 'oper_net_ch', 'detail'],
)
# Two-character operator names get the '中国' prefix; all others are kept.
num_segment_df['oper_net_ch'] = num_segment_df['oper_net_ch'].map(
    lambda name: name if len(name) != 2 else '中国' + name
)
# Write a tab-separated UTF-8 file; index=None keeps the row index out of it.
num_segment_df.to_csv(
    './号段_运营商_188.csv',
    sep='\t',
    index=None,
    encoding='utf8',
)
结果展示
