Merge the two .py files that crawl the list page and the single (detail) page, and convert the Python 3 code to Python 2.7.

Changes made: merge the two test.py files and adapt them, modify gs_extractor.py, and merge page-config.json and list-config.json into a single config.json (with adjustments).

Rough approach: first get the Python 3 code of the merged test file debugged and running, keep a copy of it, and only then rework it into Python 2.7 code.
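
For reference, the merged config.json read by loadTestConfig() in the code below needs at least the keys used there. In this sketch only the key names and the %Y%m%d%H%M%S date format come from the script; every value is a made-up placeholder. "list-rule" points to the list-extraction rule JSON (with itemURLReg / itemDateReg / paginationURLReg entries) and "page-rule" to the xslt file fed to GsExtractor:

{
    "url": "http://example.com/news/list.html",
    "start-date": "20180101000000",
    "end-date": "20181231235959",
    "list-rule": "list-rule.json",
    "page-url": "http://example.com/news/detail/1.html",
    "page-rule": "page-rule.xsl"
}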

Problems encountered in the code, and notes on the reference material consulted:

In Python 3 the encoding="utf-8" argument to open() is allowed, but Python 2.7's built-in open() has no such parameter.

 fileObj = open(filePath, 'r', encoding="utf-8")  #### the encoding="utf-8" argument is removed here in the Python 2.7 version
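
If you would rather keep the explicit encoding under Python 2.7 instead of dropping it, io.open (available since Python 2.6) accepts the same encoding argument; this is only an alternative sketch, not what the ported script below does:

 import io

 fileObj = io.open(filePath, 'r', encoding="utf-8")  # works on both 2.7 and 3, returns unicode text
 data = fileObj.read()
 fileObj.close()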

http://www.codexiu.cn/python/blog/31878/         instantiating a GsExtractor object
http://blog.51cto.com/gooseeker/1775055          generate the xslt for web content extraction in one minute
https://blog.csdn.net/weixin_35955795/article/details/52658815     Python's enumerate
https://blog.csdn.net/samsam2013/article/details/78354484           how to use Python's hasattr() / getattr() / setattr() functions
https://blog.csdn.net/huangpin815/article/details/70495906            datetime.datetime.strptime: all-purpose date format conversion (see the small example after this list)

https://www.cnblogs.com/itfat/p/7481972.html
Python modules and what import really does: os.path.abspath to get a file's absolute path, os.path.dirname to get its parent directory, sys.path.insert(0, x) to put a path at the front of the module search path
https://www.cnblogs.com/kakaln/p/8192957.html    indenting / un-indenting multiple lines of code in PyCharm
https://blog.csdn.net/sgfmby1994/article/details/77876873/    how to switch Python versions in PyCharm
https://jingyan.baidu.com/article/48b558e3ffeb667f39c09a41.html  how to add a Python virtual environment in PyCharm
https://jingyan.baidu.com/article/fdbd4277a39a12b89e3f4895.html   how to create virtual environments for Python 2 and Python 3
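
As a quick illustration of the enumerate and strptime points from the links above (the date strings and format below are made-up; the same pattern appears in testListRule() further down, where dates scraped without a year come back as 1900):

 import datetime

 dates = ["05-28 14:30", "05-29 09:12"]      # made-up post dates with no year
 for idx, text in enumerate(dates):
     d = datetime.datetime.strptime(text, "%m-%d %H:%M")
     if d.year == 1900:                      # strptime fills in 1900 when the year is missing
         d = d.replace(year=datetime.datetime.now().year)
     print("{} {}".format(idx, d))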


Q: UnicodeEncodeError: 'ascii' codec can't encode characters in position ... ?

A: Add the following at the top of the script:
 import sys
 reload(sys)
 sys.setdefaultencoding("utf-8")
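
This error shows up because Python 2 implicitly encodes unicode with the ascii codec, for example when json.dumps(..., ensure_ascii=False) returns a unicode string and it is written to a file opened in byte mode. Instead of the setdefaultencoding hack, an explicit encode at the write site also works; a minimal sketch using names from the code below (the completed code keeps the hack instead):

 with open(os.path.join(base_dir, 'result.json'), 'w') as f:
     f.write(json.dumps(raw_obj, ensure_ascii=False).encode("utf-8"))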

Below are the completed code and the original code; they will be uploaded to GitHub later.

Completed code:

# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import re
import os
import json
import time
import requests
import hashlib
import datetime
from gs_extractor import GsExtractor
# from datetime import datetime
from lxml import etree

row_column = ["title", "source", "date", "url", "body", "subTitle", "pageViews", "crawledTime"]  ## expected output fields (not referenced elsewhere in this test script)
base_dir = os.path.join(os.path.dirname(__file__), '.')
def loadTestConfig():
    filePath = os.path.dirname(os.path.abspath(__file__)) + "/config.json"
    fileObj = open(filePath, 'r')  #### encoding="utf-8" was removed here; Python 2.7's built-in open() does not accept it
    data = fileObj.read()
    return json.loads(data)

#####

# def load_test_config(base_dir):
#     with open(os.path.join(base_dir, 'config.json'), 'r') as f:
#         con = json.load(f)
#     return con
    
def getDataByRe(data, searchStr):
    regex = re.compile(searchStr, re.DOTALL)
    return regex.findall(data)

def getDataBySelector(data, searchStr):
    return data.cssselect(searchStr)
    
def test():
    config = loadTestConfig()
    filePath = os.path.dirname(os.path.abspath(__file__)) + "/" + config["list-rule"]
    fileObj = open(filePath, 'r')  #### same as above: no encoding argument under Python 2.7
    data = fileObj.read()
    rule = json.loads(data)
    startDate = datetime.datetime.strptime(config["start-date"], "%Y%m%d%H%M%S")
    endDate = datetime.datetime.strptime(config["end-date"], "%Y%m%d%H%M%S")
    testListRule(config["url"], startDate, endDate, rule, {'attrs':{}}, None)
    ####
    # config = load_test_config()
    shtml = request_url(config['page-url'])
    xml = convert(config['page-rule'], shtml)
    convertToJSON(config['page-url'], str(xml))

def testListRule(url, startDate, endDate, rule, item, lastURLMD5):
    print(url)
    time.sleep(1)
    headers = {
        "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
    }

    requests.adapters.DEFAULT_RETRIES = 5
    s = requests.session()
    s.headers.update(headers)
    s.keep_alive = False
    r = s.get(url, allow_redirects=False)
    if r.encoding == 'ISO-8859-1':
        r.encoding = r.apparent_encoding

    # get item links
    isOldContent = False
    itemList = []
    dateList = []
    selectedDateList = []

    if r.text == '' or r.text is None:
        return

    shtml = etree.HTML(r.text)

    if rule["itemURLReg"]["type"] == "cssselect":
        itemList = getDataBySelector(shtml, rule["itemURLReg"]['value'])
    elif rule["itemURLReg"]["type"] == "xpath":
        itemList = shtml.xpath(rule["itemURLReg"]['value'])
    
    if rule["itemDateReg"]["type"] == "cssselect":
        dateList = shtml.cssselect(rule["itemDateReg"]['value'])
    elif rule["itemDateReg"]["type"] == "regex":
        dateList = getDataByRe(r.text, rule["itemDateReg"]['value'])
    elif rule["itemDateReg"]["type"] == "xpath":
        dateList = shtml.xpath(rule["itemDateReg"]['value'])
    
    if(len(dateList) > 0):
        for idx, date in enumerate(dateList):
            if hasattr(date, "text"):
                cleanText = date.text.strip()
            else:
                cleanText = date.strip()

            postDate = datetime.datetime.strptime(cleanText, rule["itemDateReg"]['format'])

            # for some data source without year in post date information
            if(postDate.year == 1900):
                postDate = datetime.datetime(datetime.datetime.now().year, postDate.month, postDate.day, postDate.hour, postDate.minute, postDate.second, postDate.microsecond)

            if postDate >= startDate and postDate <= endDate:
                selectedDateList.append(idx)

        # gen item request
        lengthOfItem = len(itemList)
        if lengthOfItem > 0 and len(selectedDateList) > 0:
            for idx in selectedDateList:
                if (
                        lengthOfItem > idx and 
                        itemList[idx].attrib is not None and 
                        'href' in itemList[idx].attrib.keys()
                    ):
                    print(rule["itemURLReg"]['format'].format(itemList[idx].attrib["href"]), dateList[idx])

            idx = selectedDateList[len(selectedDateList) - 1]
            lastURL = itemList[idx].attrib["href"]
            hashStr = hashlib.sha1(lastURL.encode("utf8")).hexdigest()
            if hashStr == lastURLMD5:
                isOldContent = True
            else:
                isOldContent = False
                lastURLMD5 = hashStr

        if len(selectedDateList) > 0 and isOldContent is not True:
            # find pagination
            if rule["paginationURLReg"]['type'] == "auto-increase":
                if 'pageIndex' in item['attrs'].keys():
                    step = 1
                    if "step" in rule["paginationURLReg"].keys():
                        step = rule["paginationURLReg"]["step"]

                    pageStartIndex = item['attrs']['pageIndex'] + step
                else:
                    pageStartIndex = rule["paginationURLReg"]['value']
                
                item['attrs']['pageIndex'] = pageStartIndex
                paginationURL = rule["paginationURLReg"]['format'].format(pageStartIndex)
                # print(paginationURL)
                testListRule(paginationURL, startDate, endDate, rule, item, lastURLMD5)

            elif rule["paginationURLReg"]['type'] == "cssselect":
                pageList = getDataBySelector(shtml, rule["paginationURLReg"]['value'])
                if len(pageList) > 0:
                    for pagination in pageList:
                        if pagination.attrib is not None and 'href' in pagination.attrib.keys():
                            paginationURL = rule["paginationURLReg"]['format'].format(pagination.attrib["href"])
                            # print(paginationURL)
                            testListRule(paginationURL, startDate, endDate, rule, item, lastURLMD5)

# @staticmethod           # the url argument received here is config["page-url"]
def request_url(url):
    headers = {
        "User-Agent": "Mozilla/5.0+(Windows+NT+6.2;+WOW64)+AppleWebKit/537.36+(KHTML,+like+Gecko)+Chrome/45.0.2454.101+Safari/537.36"
    }
    r = requests.get(url, headers=headers)
    if r.encoding == 'ISO-8859-1':
        r.encoding = r.apparent_encoding
        # return r.content.decode('utf8')
    return r.text            

def convert(rule_file, html):
    gs_extractor = GsExtractor()
    gs_extractor.setXsltFromFile(os.path.join(base_dir, rule_file))
    result_xml = gs_extractor.extract(etree.HTML(html))

    filePath = os.path.join(base_dir, 'result.xml')
    result_xml.write(filePath, encoding="utf8")

    # with open(os.path.join(self.base_dir, 'result.xml'), 'w') as f:
    #     f.write(str(result_xml))
    # print("convert finish\nfile saved to \"resolved.xml\"")

    return result_xml

def fineText(rawText):
    rawText = rawText.replace('\n', ' ').replace('\r', ' ').replace('\f', ' ').replace('\t', ' ')
    regex = re.compile(r"\s{2,}")
    rawText = regex.sub(" ", rawText)
    return rawText

# the url argument received here is also config["page-url"]
def convertToJSON(url, xml):
    raw_obj = {
        "url": url,
        "crawledTime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    xml_root = etree.fromstring(xml)
    item_root = xml_root.find('item')
    for child in item_root:
        if child.text is not None:
            if 'regex' in child.attrib.keys():
                regex = re.compile(child.attrib["regex"])
                result = regex.findall(child.text)
                if len(result) > 0:
                    if 'format' in child.attrib.keys():
                        raw_obj[child.tag] = child.attrib["format"].format(result[0])
                    else:
                        raw_obj[child.tag] = result[0]
                else:
                    raw_obj[child.tag] = None
            else:
                raw_obj[child.tag] = fineText(child.text.strip())
        elif len(child) > 0:
            raw_obj[child.tag] = []
            for subchild in child:
                raw_obj[child.tag].append(subchild.text)
        else:
            raw_obj[child.tag] = None
            # print(u"{}-{}".format(child.tag, raw_obj[child.tag]))

    with open(os.path.join(base_dir, 'result.json'), 'w') as f:           ## same as above: plain open(), no encoding argument
        f.write(json.dumps(raw_obj, ensure_ascii=False))
test()
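
gs_extractor.py itself is not shown above. For orientation, here is a minimal sketch of the two methods the script calls (setXsltFromFile and extract), assuming the usual lxml XSLT approach; the real GooSeeker gs_extractor.py has more (e.g. loading rules from the GooSeeker API) and may differ in detail:

# -*- coding: utf-8 -*-
from lxml import etree

class GsExtractor(object):
    def __init__(self):
        self.xslt = ""  # raw xslt rule text

    def setXsltFromFile(self, xsltFilePath):
        # load the extraction rule (an xslt document) from disk
        f = open(xsltFilePath, 'r')
        try:
            self.xslt = f.read()
        finally:
            f.close()

    def extract(self, html):
        # compile the xslt and apply it to the parsed HTML tree; the result
        # supports .write(path, encoding=...) and str(), as used in convert() and convertToJSON()
        xslt_root = etree.XML(self.xslt)
        transform = etree.XSLT(xslt_root)
        return transform(html)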

 
