打造一个轻量级企业基本信息采集框架（五）-CSDN博客

本文链接：https://blog.csdn.net/Since_you/article/details/105522648

前言

在上文中，我们定义了下载器，大家是不是对这个框架已经有了一定的认识，不由自主地发出一句：就这？哈哈哈。今天主要介绍解析部分，很简单，有请它闪亮登场。

def rphtml(self, html):
        """
        Strip markup tags from an enterprise-name string.

        :param html: text that may contain ``<...>`` tags
        :return: the text with every ``<...>`` run removed; a falsy input
                 (``None``, empty string) is returned unchanged
        """
        # Guard clause: nothing to clean for None / empty values.
        if not html:
            return html
        return re.sub(r'<[^>]+>', '', html, flags=re.S)


    def collect_data(self, data, keyword, page=1):
        """
        Collect basic enterprise records for ``keyword``, following pagination.

        Starting at ``page``, each result page is fetched with ``req_data``.
        Every record not seen before is stored through
        ``self.dbcollect.insert_basicsql`` and then yielded. When all pages
        have been read the keyword is marked via ``self.client.set_success``;
        as soon as a request returns nothing it is marked via
        ``self.client.set_failure`` and collection stops.

        :param data: request parameters, forwarded unchanged to ``req_data``
        :param keyword: search keyword
        :param page: page to start from, defaults to the first page
        :return: generator of the parameter dicts that were inserted
        """
        import hashlib  # local import: only needed for the fingerprint below

        # Pages are walked iteratively so that the keyword status is written
        # exactly once: a failure on a later page is not overwritten by a
        # success afterwards, and deep result sets do not nest generators.
        while True:
            # Pause 1-2 seconds before every request.
            time.sleep(random.randint(1, 2))
            response = req_data(self.eplisturl, data, page, keyword)
            if not response:
                self.client.set_failure(keyword)
                return

            result = json.loads(response)["data"]["result"]
            eplistarray = result["data"]
            if not eplistarray:
                break

            logger.info("目前正在采集~~~~~" + keyword + "的第" + str(page) + "页")
            for eplist in eplistarray:
                record = {
                    'entName': self.rphtml(eplist['entName']),
                    'pripid': eplist['pripid'],
                    'regNo': eplist['regNo'],
                    'uniscId': eplist['uniscId'],
                    'legelRep': eplist['legelRep'],
                    'entTypeCn': eplist['entTypeCn'],
                    'corpStatusString': eplist['corpStatusString'],
                    'estDate': eplist['estDate'],
                    'regOrg': eplist['regOrg'],
                    'busExceptCount': eplist['busExceptCount'],
                    'illCount': eplist['illCount'],
                    'nodeNum': eplist['nodeNum'],
                    'historyName': self.rphtml(eplist['historyName']),
                    'entType': eplist['entType'],
                }
                # Fingerprint only the stable record fields with a fresh
                # hasher. Hashing the random guid (or the whole page response)
                # or re-using one accumulating hasher would give every record
                # a unique digest, so duplicates would never be detected.
                fingerprint = hashlib.md5(str(record).encode("utf-8")).hexdigest()
                if fingerprint in self.set_list:
                    continue
                self.set_list.add(fingerprint)

                # Same key order as before: guid, record fields, raw response.
                params = {'guid': str(uuid.uuid4()), **record, 'json_data': response}
                self.dbcollect.insert_basicsql("qggs_app_collectlist", params)
                logger.info("成功采集^^^" + record['entName'] + "的基本信息...")
                yield params

            recordsTotal = result['recordsTotal']  # total record count, decides paging
            logger.info("关键词为^^^" + keyword + "一共" + str(recordsTotal) + "条数据...")
            # The paging check divides the total by 10 (presumably the page
            # size - confirm against the API); stop once this page covers it.
            if recordsTotal / 10 <= page:
                break
            page += 1

        self.client.set_success(keyword)

第一个rphtml()函数是对返回值做一个数据处理，简单的正则应用。
主体部分还是collect_data()函数：我们将返回的值做json解析（这边推荐大家一个在线的json解析网站“json解析”，很好用），再将获取的数据打包成一个字典，传给存储部分。因为采集的是一个列表页，可能存在翻页，所以我们将数据yield返回，并继续进行多页采集。对于上文中提到的请求5次还是失败的keyword，会将其在zset集合内的status改为2，成功的改为1，后面就不会再采集到了。还记得我们调度器那边有一块代码么？当时我没讲，直接跳过了，这个时候它就发挥作用了。

if keyword_list:
    # Run word segmentation on the name of every enterprise record produced
    # for these keywords. Indentation is 4 spaces throughout: mixing tabs and
    # spaces here is rejected by Python 3.
    for params in CrawlUrl().get_params(keyword_list):
        self.cut.cutentername(params["entName"])  # word segmentation

这边就是对采集到的企业名称进行分词，然后再将分词结果加入到redis集合中。
我们看下分词这边的代码。

class CutEntername:
    """Segment enterprise names and register words not yet known to the client."""

    def __init__(self):
        self.client = RedisDbConn()

    def cutentername(self, entername):
        """
        Segment an enterprise name and register each new word as a keyword.

        Words are produced by ``jieba.cut``. Words of a single character and
        the literal ``'\\r\\n'`` are ignored. Every remaining word for which
        ``self.client.exists_keyword`` is false is passed to
        ``self.client.initial_set``, most frequent words first.

        :param entername: enterprise name to segment
        :return: None
        """
        tokens = jieba.cut(entername)
        logger.info("正在分词------" + entername)
        # Count the usable tokens; the Counter also collapses repeated words.
        counts = Counter(
            tok for tok in tokens if len(tok) > 1 and tok != '\r\n'
        )
        for keyword, _ in counts.most_common():
            if isinstance(keyword, str) and not self.client.exists_keyword(keyword):
                self.client.initial_set(keyword)