Changes made: merged the two test.py files and modified the result, updated gs_extractor.py, and merged page-config.json and list-config.json into a single config.json (with adjustments).
Basic approach: first get the merged test file's Python 3 code debugged and working, keep a copy of it, and then port the code to Python 2.7.
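For reference, judging from the keys the merged script reads below (url, list-rule, start-date, end-date, page-url, page-rule), the merged config.json presumably looks something like this; every value here is a made-up placeholder, not the real configuration:

{
    "url": "http://example.com/news/list.html",
    "list-rule": "list-rule.json",
    "start-date": "20180501000000",
    "end-date": "20180531235959",
    "page-url": "http://example.com/news/detail/12345.html",
    "page-rule": "page-rule.xsl"
}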
Problems encountered in the code, and notes on the references consulted:
In Python 3 the encoding="utf-8" keyword argument to open() is allowed, but open() in Python 2.7 does not accept it.
fileObj = open(filePath, 'r', encoding="utf-8")  # the encoding="utf-8" argument was removed here
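If the files are UTF-8 encoded and the code has to run on both interpreters, one workable alternative (an assumption on my part, not what the final script below does) is io.open, which exists in both Python 2.6+ and Python 3 and accepts an encoding argument:

# -*- coding: utf-8 -*-
import io

# io.open accepts encoding= under both Python 2.7 and Python 3;
# under Python 2 it returns unicode text instead of byte strings.
with io.open('config.json', 'r', encoding='utf-8') as f:
    data = f.read()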
http://www.codexiu.cn/python/blog/31878/ instantiating a GsExtractor object
http://blog.51cto.com/gooseeker/1775055 generate an XSLT for web-content extraction in one minute
https://blog.csdn.net/weixin_35955795/article/details/52658815 the enumerate statement in Python
https://blog.csdn.net/samsam2013/article/details/78354484 detailed usage of Python's hasattr(), getattr(), setattr() functions
https://blog.csdn.net/huangpin815/article/details/70495906 datetime.datetime.strptime: all-purpose date-format conversion (a short sketch of these idioms follows this list)
https://www.cnblogs.com/itfat/p/7481972.html Python modules and what import really does: getting a file's absolute path with os.path.abspath, its parent directory with os.path.dirname, and prepending to the module search path with sys.path.insert(0, x)
https://www.cnblogs.com/kakaln/p/8192957.html indenting and unindenting multiple lines of code in PyCharm
https://blog.csdn.net/sgfmby1994/article/details/77876873/ how to switch Python versions in PyCharm
https://jingyan.baidu.com/article/48b558e3ffeb667f39c09a41.html how to add a Python virtual environment in PyCharm
https://jingyan.baidu.com/article/fdbd4277a39a12b89e3f4895.html how to create virtual environments for Python 2 and Python 3
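To tie a few of those references together, here is a minimal standalone sketch (with made-up dates, not part of the crawler) of the enumerate/hasattr/strptime idioms the script below relies on:

# -*- coding: utf-8 -*-
import datetime

dates = ["05-20 08:30", "05-21 09:00"]  # made-up post dates without a year
for idx, text in enumerate(dates):      # enumerate yields (index, value) pairs
    # hasattr distinguishes lxml elements (which carry .text) from plain strings
    clean = text.text.strip() if hasattr(text, "text") else text.strip()
    post = datetime.datetime.strptime(clean, "%m-%d %H:%M")
    # strptime defaults any missing field: a format without %Y yields year 1900
    print(idx, post)  # e.g. (0, datetime.datetime(1900, 5, 20, 8, 30))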
Q: UnicodeEncodeError: 'ascii' codec can't encode characters in position ...?
A: Add the following at the top of the file (Python 2.7 only):
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
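A minimal reproduction of the error, assuming a Python 2.7 interpreter with the default ASCII codec:

# -*- coding: utf-8 -*-
# Under Python 2.7, coercing a unicode string containing non-ASCII
# characters to a byte string uses the default 'ascii' codec and fails:
s = u"中文"
str(s)  # UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-1

# After sys.setdefaultencoding("utf-8"), the same coercion silently
# encodes to UTF-8 bytes instead of raising.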
Below are the finished code and the earlier code; I will upload them to GitHub later.
The finished code:
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

import re
import os
import json
import time
import requests
import hashlib
import datetime
from gs_extractor import GsExtractor
# from datetime import datetime
from lxml import etree

row_column = ["title", "source", "date", "url", "body", "subTitle", "pageViews", "crawledTime"]
base_dir = os.path.join(os.path.dirname(__file__), '.')
def loadTestConfig():
    filePath = os.path.dirname(os.path.abspath(__file__)) + "/config.json"
    fileObj = open(filePath, 'r')  # encoding="utf-8" removed here for Python 2.7
    data = fileObj.read()
    return json.loads(data)

# Alternative kept from the Python 3 version:
# def load_test_config(base_dir):
#     with open(os.path.join(base_dir, 'config.json'), 'r') as f:
#         con = json.load(f)
#     return con
def getDataByRe(data, searchStr):
    regex = re.compile(searchStr, re.DOTALL)
    return regex.findall(data)

def getDataBySelector(data, searchStr):
    return data.cssselect(searchStr)
def test():
    config = loadTestConfig()
    filePath = os.path.dirname(os.path.abspath(__file__)) + "/" + config["list-rule"]
    fileObj = open(filePath, 'r')  # same as above: no encoding argument
    data = fileObj.read()
    rule = json.loads(data)
    startDate = datetime.datetime.strptime(config["start-date"], "%Y%m%d%H%M%S")
    endDate = datetime.datetime.strptime(config["end-date"], "%Y%m%d%H%M%S")
    testListRule(config["url"], startDate, endDate, rule, {'attrs': {}}, None)
    # config = load_test_config()
    shtml = request_url(config['page-url'])
    xml = convert(config['page-rule'], shtml)
    convertToJSON(config['page-url'], str(xml))
def testListRule(url, startDate, endDate, rule, item, lastURLMD5):
    print(url)
    time.sleep(1)
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
    }
    requests.adapters.DEFAULT_RETRIES = 5
    s = requests.session()
    s.headers.update(headers)
    s.keep_alive = False
    r = s.get(url, allow_redirects=False)
    if r.encoding == 'ISO-8859-1':
        r.encoding = r.apparent_encoding
    # get item links
    isOldContent = False
    itemList = []
    dateList = []
    selectedDateList = []
    if r.text == '' or r.text is None:
        return
    shtml = etree.HTML(r.text)
    if rule["itemURLReg"]["type"] == "cssselect":
        itemList = getDataBySelector(shtml, rule["itemURLReg"]['value'])
    elif rule["itemURLReg"]["type"] == "xpath":
        itemList = shtml.xpath(rule["itemURLReg"]['value'])
    if rule["itemDateReg"]["type"] == "cssselect":
        dateList = shtml.cssselect(rule["itemDateReg"]['value'])
    elif rule["itemDateReg"]["type"] == "regex":
        dateList = getDataByRe(r.text, rule["itemDateReg"]['value'])
    elif rule["itemDateReg"]["type"] == "xpath":
        dateList = shtml.xpath(rule["itemDateReg"]['value'])
    if len(dateList) > 0:
        for idx, date in enumerate(dateList):
            if hasattr(date, "text"):
                cleanText = date.text.strip()
            else:
                cleanText = date.strip()
            postDate = datetime.datetime.strptime(cleanText, rule["itemDateReg"]['format'])
            # some data sources omit the year from the post date, so strptime
            # defaults to 1900; substitute the current year in that case
            if postDate.year == 1900:
                postDate = datetime.datetime(datetime.datetime.now().year, postDate.month, postDate.day, postDate.hour, postDate.minute, postDate.second, postDate.microsecond)
            if postDate >= startDate and postDate <= endDate:
                selectedDateList.append(idx)
    # gen item request
    lengthOfItem = len(itemList)
    if lengthOfItem > 0 and len(selectedDateList) > 0:
        for idx in selectedDateList:
            if (
                lengthOfItem > idx and
                itemList[idx].attrib is not None and
                'href' in itemList[idx].attrib.keys()
            ):
                print(rule["itemURLReg"]['format'].format(itemList[idx].attrib["href"]), dateList[idx])
        idx = selectedDateList[len(selectedDateList) - 1]
        lastURL = itemList[idx].attrib["href"]
        # note: despite the parameter name, this digest is SHA-1, not MD5
        hashStr = hashlib.sha1(lastURL.encode("utf8")).hexdigest()
        if hashStr == lastURLMD5:
            isOldContent = True
        else:
            isOldContent = False
            lastURLMD5 = hashStr
    if len(selectedDateList) > 0 and isOldContent is not True:
        # find pagination
        if rule["paginationURLReg"]['type'] == "auto-increase":
            if 'pageIndex' in item['attrs'].keys():
                step = 1
                if "step" in rule["paginationURLReg"].keys():
                    step = rule["paginationURLReg"]["step"]
                pageStartIndex = item['attrs']['pageIndex'] + step
            else:
                pageStartIndex = rule["paginationURLReg"]['value']
            item['attrs']['pageIndex'] = pageStartIndex
            paginationURL = rule["paginationURLReg"]['format'].format(pageStartIndex)
            # print(paginationURL)
            testListRule(paginationURL, startDate, endDate, rule, item, lastURLMD5)
        elif rule["paginationURLReg"]['type'] == "cssselect":
            pageList = getDataBySelector(shtml, rule["paginationURLReg"]['value'])
            if len(pageList) > 0:
                for pagination in pageList:
                    if pagination.attrib is not None and 'href' in pagination.attrib.keys():
                        paginationURL = rule["paginationURLReg"]['format'].format(pagination.attrib["href"])
                        # print(paginationURL)
                        testListRule(paginationURL, startDate, endDate, rule, item, lastURLMD5)
# @staticmethod  # the url argument here receives page-url
def request_url(url):
    headers = {
        "User-Agent": "Mozilla/5.0+(Windows+NT+6.2;+WOW64)+AppleWebKit/537.36+(KHTML,+like+Gecko)+Chrome/45.0.2454.101+Safari/537.36"
    }
    r = requests.get(url, headers=headers)
    if r.encoding == 'ISO-8859-1':
        r.encoding = r.apparent_encoding
    # return r.content.decode('utf8')
    return r.text
def convert(rule_file, html):
    gs_extractor = GsExtractor()
    gs_extractor.setXsltFromFile(os.path.join(base_dir, rule_file))
    result_xml = gs_extractor.extract(etree.HTML(html))
    filePath = os.path.join(base_dir, 'result.xml')
    result_xml.write(filePath, encoding="utf8")
    # with open(os.path.join(self.base_dir, 'result.xml'), 'w') as f:
    #     f.write(str(result_xml))
    # print("convert finish\nfile saved to \"resolved.xml\"")
    return result_xml
def fineText(rawText):
    rawText = rawText.replace('\n', ' ').replace('\r', ' ').replace('\f', ' ').replace('\t', ' ')
    regex = re.compile(r"\s{2,}")
    rawText = regex.sub(" ", rawText)
    return rawText
# the url argument here is also page-url
def convertToJSON(url, xml):
    raw_obj = {
        "url": url,
        "crawledTime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }
    xml_root = etree.fromstring(xml)
    item_root = xml_root.find('item')
    for child in item_root:
        if child.text is not None:
            if 'regex' in child.attrib.keys():
                regex = re.compile(child.attrib["regex"])
                result = regex.findall(child.text)
                if len(result) > 0:
                    if 'format' in child.attrib.keys():
                        raw_obj[child.tag] = child.attrib["format"].format(result[0])
                    else:
                        raw_obj[child.tag] = result[0]
                else:
                    raw_obj[child.tag] = None
            else:
                raw_obj[child.tag] = fineText(child.text.strip())
        elif len(child) > 0:
            raw_obj[child.tag] = []
            for subchild in child:
                raw_obj[child.tag].append(subchild.text)
        else:
            raw_obj[child.tag] = None
        # print(u"{}-{}".format(child.tag, raw_obj[child.tag]))
    with open(os.path.join(base_dir, 'result.json'), 'w') as f:  # same as above: no encoding argument
        f.write(json.dumps(raw_obj, ensure_ascii=False))
test()
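For completeness: testListRule expects the rule file named by config["list-rule"] to carry three blocks, itemURLReg, itemDateReg, and paginationURLReg. A guessed-at example, reconstructed purely from the keys the code reads (the selectors, regexes, and URL templates are placeholders, not a real site's rules):

{
    "itemURLReg": {"type": "cssselect", "value": "ul.news-list a", "format": "http://example.com{0}"},
    "itemDateReg": {"type": "regex", "value": "\\d{2}-\\d{2} \\d{2}:\\d{2}", "format": "%m-%d %H:%M"},
    "paginationURLReg": {"type": "auto-increase", "value": 2, "format": "http://example.com/news/index_{0}.html", "step": 1}
}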