import re
import os
import sys
import json
import nltk
import time
import pickle
import random
import base64
import datetime
import requests
import openpyxl
import readline
import itertools
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm, trange
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from collections import Counter
from pypinyin import lazy_pinyin, Style
from joblib import Parallel, delayed
from sklearn.linear_model import LinearRegression
import warnings
# Install a filter that ignores every warning for the rest of the process.
warnings.filterwarnings('ignore')

# Widen pandas' console rendering so large frames and long cell texts are
# printed without being wrapped or truncated early.
pd.options.display.width = 500
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200
pd.options.display.max_colwidth = 1000
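# FAQ answer columns and the question each one answers:
# 'sfzmsy'      - how is the algorithm used
# 'sfzmsf'      - how is the algorithm priced
# 'sfdsxtysmyq' - what does the algorithm require of the camera
# 'sfrhdz'      - how can the algorithm be customized
# 'sfxgrh'      - how well does the algorithm perform
# 'sfzcgchfwq'  - is deployment on domestic (China-made) platforms supported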
# Accumulates one [sfmc, sfms, sffl, yycj, cjwt] row per successfully parsed page.
result = []


def _class_text(parent, css_class):
    """Return the text of the first element under *parent* whose class
    attribute is *css_class*, or '' when *parent* or the element is missing."""
    if parent is None:
        return ''
    node = parent.find(attrs={"class": css_class})
    return '' if node is None else node.text


def parse_html_element(html_url, timeout=10):
    """Fetch one page and append its extracted fields to the global ``result``.

    On an HTTP 200 response the page is parsed and a row
    ``[sfmc, sfms, sffl, yycj, cjwt]`` is appended to ``result``:

    * sfmc - algorithm name
    * sfms - algorithm description
    * sffl - algorithm category tags
    * yycj - application scenarios (first ``ul`` section)
    * cjwt - FAQ (second ``ul`` section)

    Any piece that cannot be found on the page is stored as ``''`` instead of
    aborting the whole scrape. On any other status code nothing is appended
    and the function pauses for 1-2 seconds before returning.

    Args:
        html_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up; without it
            a stalled connection would block forever.

    Returns:
        None. The outcome is the row appended to ``result``.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    response = requests.request('GET', html_url, timeout=timeout)
    if response.status_code != 200:
        # Brief randomized pause after a failed request.
        time.sleep(random.randint(1, 2))
        return

    page = BeautifulSoup(response.text, 'lxml')

    # Name, description and category all live inside the same container,
    # so look it up once.
    cont_box = page.find(attrs={"class": "cont-box"})
    sfmc = _class_text(cont_box, "h1 title type-a color2")  # algorithm name
    sfms = _class_text(cont_box, "des type-a")  # algorithm description
    sffl = _class_text(cont_box, "tag-list")  # algorithm category

    # Application scenarios & FAQ: the first two "ul" sections, in that order.
    info_top = page.find(attrs={"class": "algorithm-info-top"})
    sections = []
    if info_top is not None:
        sections = [
            section.text.strip()
            for section in info_top.find_all(attrs={"class": "ul"})
        ]
    # Pad so that a page with fewer than two sections yields '' for the rest.
    sections += [''] * (2 - len(sections))
    yycj, cjwt = sections[0], sections[1]

    result.append([sfmc, sfms, sffl, yycj, cjwt])
# Column names of the raw scraped rows, in the order they are stored in `result`.
RAW_COLUMNS = ['sfmc', 'sfms', 'sffl', 'yycj', 'cjwt']
# Output column for each FAQ answer; list position == position of the entry
# inside the cleaned `cjwt` text.
FAQ_COLUMNS = ['sfzmsy', 'sfzmsf', 'sfdsxtysmyq', 'sfrhdz', 'sfxgrh', 'sfzcgchfwq']


def _faq_answer(cjwt, position):
    """Return the answer of the FAQ entry at *position* in *cjwt*.

    *cjwt* is the cleaned FAQ text: entries separated by ';', each entry of
    the form 'question:answer'. The text after the last ':' of the entry is
    returned, stripped. Returns '' when there is no entry at *position*.
    """
    entries = cjwt.split(';')
    if position >= len(entries):
        return ''
    return entries[position].split(':')[-1].strip()


# Passing `columns=` to the constructor (instead of assigning `.columns`
# afterwards) also works when `result` is empty.
result_df = pd.DataFrame(result, columns=RAW_COLUMNS)
# Drop exact duplicate rows and keep working on the de-duplicated frame so
# the duplicates do not end up in the exported files.
result_df = result_df.drop_duplicates().reset_index(drop=True)
print(len(result))

# Normalize the whitespace-heavy text extracted from the HTML.
result_df['sfmc'] = result_df.sfmc.apply(lambda sfmc: sfmc.strip())
result_df['sffl'] = result_df.sffl.apply(lambda sffl: sffl.strip().replace('\n\n', ','))
# Inside a section: title/body separator -> ':', entry separator -> ';',
# then remove every remaining space.
result_df['yycj'] = result_df.yycj.apply(
    lambda yycj: yycj.strip().replace(' \t\n ', ':').replace('\n\n\n\n\r\n', ';').replace(' ', '')
)
result_df['cjwt'] = result_df.cjwt.apply(
    lambda cjwt: cjwt.strip().replace(' \n ', ':').replace('\n\n\n\n\r\n', ';').replace(' ', '')
)

# One column per FAQ answer; missing entries become '' for every column
# (previously only the 4th-6th were guarded and the 2nd/3rd could raise).
for faq_position, faq_column in enumerate(FAQ_COLUMNS):
    result_df[faq_column] = result_df.cjwt.apply(_faq_answer, args=(faq_position,))

# `sftd` is the last space-separated token of the name ('--' when the name is
# a single token); it must be derived before `sfmc` is cut to its first token.
result_df['sftd'] = result_df.sfmc.apply(
    lambda sfmc: sfmc.split(' ')[-1].strip() if len(sfmc.split(' ')) > 1 else '--'
)
result_df['sfmc'] = result_df.sfmc.apply(lambda sfmc: sfmc.split(' ')[0].strip())

# `to_excel` no longer accepts `encoding` (removed in pandas 2.0); xlsx
# writers handle Unicode natively, so the argument is simply dropped.
result_df.to_excel(r'常用成熟-AI场景识别算法.xlsx', index=False)
result_df.to_csv(r'常用成熟-AI场景识别算法.csv', index=False, encoding='utf8')
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
- 32.
- 33.
- 34.
- 35.
- 36.
- 37.
- 38.
- 39.
- 40.
- 41.
- 42.
- 43.
- 44.
- 45.
- 46.
- 47.
- 48.
- 49.
- 50.
- 51.
- 52.
- 53.
- 54.
- 55.
- 56.
- 57.
- 58.
- 59.
- 60.
- 61.
- 62.
- 63.
- 64.
- 65.
- 66.
- 67.
- 68.
- 69.
- 70.
- 71.
- 72.
- 73.
- 74.
- 75.
- 76.
- 77.
- 78.
- 79.
- 80.
- 81.
- 82.
- 83.