多行重复代码的重构

最新推荐文章于 2024-09-05 15:23:49 发布

码农初长成

最新推荐文章于 2024-09-05 15:23:49 发布

阅读量786

点赞数

分类专栏：重构 | 文章标签：重构 python java 重复代码的重构

本文链接：https://blog.csdn.net/qq1300375795/article/details/77963913

版权

重构专栏收录该内容

1 篇文章 0 订阅

订阅专栏

多行重复代码的重构

这个问题是在解析页面时遇到的。代码是用 Python 写的，其他语言的思路也差不多，可以参考一下。

原先的代码:

# -*- coding: utf-8 -*-
import scrapy
from Tencent.items import TencentItem
import collections


class TencentSpider(scrapy.Spider):
    """Spider for the position listing pages served under hr.tencent.com.

    Each listing page is requested as ``baseUrl + str(offset)``.  ``parse``
    keeps requesting the next offset until a page yields no table rows.
    """

    name = 'tencent'
    allowed_domains = ['tencent.com']
    baseUrl = "http://hr.tencent.com/position.php?&start="
    offset = 0
    start_urls = [baseUrl + str(offset)]

    def parse(self, response):
        """Yield one ``TencentItem`` per table row, then request the next page.

        Rows are the ``<tr>`` elements whose class is ``even`` or ``odd``.
        For every field the first match of its XPath is stored UTF-8 encoded;
        a field without any match is stored as the string ``"NULL"``.  When
        the page has no rows, nothing is yielded and no follow-up request is
        issued, so pagination stops there.
        """
        rows = response.xpath(" //tr[@class='even'] "
                              " | "
                              " //tr[@class='odd'] ")

        # No rows means we have gone past the last page: stop paginating.
        if not rows:
            return

        for row in rows:
            item = TencentItem()

            # Each field follows the same pattern: run the XPath once, keep
            # the first extracted match, or fall back to "NULL".
            name_sel = row.xpath("./td[1]/a/text()")
            item["positionName"] = (
                name_sel.extract()[0].encode("utf-8") if name_sel else "NULL"
            )

            link_sel = row.xpath("./td[1]/a/@href")
            item["positionLink"] = (
                link_sel.extract()[0].encode("utf-8") if link_sel else "NULL"
            )

            type_sel = row.xpath("./td[2]/text()")
            item["positionType"] = (
                type_sel.extract()[0].encode("utf-8") if type_sel else "NULL"
            )

            number_sel = row.xpath("./td[3]/text()")
            item["peopleNumber"] = (
                number_sel.extract()[0].encode("utf-8") if number_sel else "NULL"
            )

            location_sel = row.xpath("./td[4]/text()")
            item["workLocation"] = (
                location_sel.extract()[0].encode("utf-8") if location_sel else "NULL"
            )

            time_sel = row.xpath("./td[5]/text()")
            item["publistTime"] = (
                time_sel.extract()[0].encode("utf-8") if time_sel else "NULL"
            )

            # Hand the populated item over to the pipelines.
            yield item

        # Build the link to the next page and schedule it with this callback.
        self.offset += 10
        next_url = self.baseUrl + str(self.offset)
        yield scrapy.Request(next_url, callback=self.parse)

重构后的代码(把字段名和对应的 xpath 路径放进一个有序字典,再用一个循环代替原先重复的 if/else):

# -*- coding: utf-8 -*-
import scrapy
from Tencent.items import TencentItem
import collections


class TencentSpider(scrapy.Spider):
    """Spider for the position listing pages served under hr.tencent.com.

    Each listing page is requested as ``baseUrl + str(offset)``.  ``parse``
    keeps requesting the next offset until a page yields no table rows.
    The fields to scrape are described once, as data, by
    ``getOrderedDict``.
    """

    name = 'tencent'
    allowed_domains = ['tencent.com']
    baseUrl = "http://hr.tencent.com/position.php?&start="
    offset = 0
    start_urls = [baseUrl + str(offset)]

    def getOrderedDict(self):
        """Return the page layout as an ordered field-name -> XPath mapping.

        Every XPath is relative to a single table row.  A fresh
        ``OrderedDict`` is returned on each call, so callers may modify the
        result freely.
        """
        return collections.OrderedDict([
            ("positionName", "./td[1]/a/text()"),
            ("positionLink", "./td[1]/a/@href"),
            ("positionType", "./td[2]/text()"),
            ("peopleNumber", "./td[3]/text()"),
            ("workLocation", "./td[4]/text()"),
            ("publistTime", "./td[5]/text()"),
        ])

    def parse(self, response):
        """Yield one ``TencentItem`` per table row, then request the next page.

        Rows are the ``<tr>`` elements whose class is ``even`` or ``odd``.
        For every field in ``getOrderedDict()`` the first match of its XPath
        is stored UTF-8 encoded; a field without any match is stored as the
        string ``"NULL"``.  When the page has no rows, nothing is yielded and
        no follow-up request is issued, so pagination stops there.
        """
        rows = response.xpath(" //tr[@class='even'] "
                              " | "
                              " //tr[@class='odd'] ")

        # No rows means we have gone past the last page: stop paginating.
        if not rows:
            return

        # Field name -> row-relative XPath; built once per page, not per row.
        field_xpaths = self.getOrderedDict()

        for row in rows:
            item = TencentItem()
            # Iterate name and XPath together instead of looking each key
            # up again inside the loop.
            for field, xpath in field_xpaths.items():
                matches = row.xpath(xpath)
                if matches:
                    item[field] = matches.extract()[0].encode("utf-8")
                else:
                    item[field] = "NULL"

            # Hand the populated item over to the pipelines.
            yield item

        # Build the link to the next page and schedule it with this callback.
        self.offset += 10
        next_url = self.baseUrl + str(self.offset)
        yield scrapy.Request(next_url, callback=self.parse)