How it works:
1) Connect to the database, read a set number of articles, and split each article into sentences on punctuation marks, keeping only sentences that qualify; here I require more than 12 characters and extract 2 sentences per article (see the sketch right after this list).
2) Run each extracted sentence through Baidu's search interface and count how many of the top-10 results contain the sentence in full. If the duplication stays within 30% (3 duplicate results allowed per sentence, so 6 in total for 2 sentences), the script reports that the article's duplication is low and it meets the requirement.
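To make step 1 concrete, here is a minimal, self-contained sketch of the sentence extraction; the split pattern and thresholds mirror the full script below, and the sample string is made up:

import re

def extract_sentences(text, min_len=12, limit=2):
    # Split on full-width and ASCII sentence punctuation, then keep the
    # first `limit` sentences longer than `min_len` characters
    parts = re.split(r'[,,??!!。::]', text)
    return [s for s in parts if len(s) > min_len][:limit]

print(extract_sentences('短句。这是一条超过十二个字符的足够长的示例句子,另一条同样超过十二个字符的示例句子!'))
# ['这是一条超过十二个字符的足够长的示例句子', '另一条同样超过十二个字符的示例句子']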
Note 1: Place a cookie.txt file in the same directory as this script. It can hold cookies from several logged-in Baidu accounts; the script picks one at random per request, so no single account crawls too often and gets blocked.
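For illustration, cookie.txt holds one full Cookie header value per line, one line per account; a hypothetical two-account file might look like this (the values are placeholders):

BAIDUID=xxxxx; BDUSS=xxxxx
BAIDUID=xxxxx; BDUSS=xxxxx

The script reads every line into a list and chooses one at random for each request (see the Cookie header in getHTml below).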
Note 2: Consider crawling through dynamic IPs, which greatly cuts down on blocked fetches. I use Abuyun, where one IP tunnel costs 1 yuan per hour; weigh for yourself whether you need it. The tunnel wiring is already in getHTml below, and a quick standalone test follows.
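Before wiring the tunnel into the crawler, it helps to verify it in isolation. A minimal sketch, assuming httpbin.org as the IP echo service and the same placeholder credentials used in getHTml below:

import requests

# Abuyun's dynamic tunnel keeps one fixed endpoint but rotates the exit IP,
# so the reported origin should differ between calls; credentials are placeholders
proxy = "http://%s:%s@http-dyn.abuyun.com:9020" % ("xxxxx", "xxxxx")
proxies = {"http": proxy, "https": proxy}
print(requests.get("http://httpbin.org/ip", proxies=proxies, timeout=30).text)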
#coding:utf-8
import requests, re, time, json
import multiprocessing  # only needed by the commented-out parallel variant at the bottom
import pymysql as mdb
from random import choice

current_date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
# One cookie string per line, one line per logged-in Baidu account
cookie = [line.strip() for line in open('cookie.txt', encoding='utf-8')]
def search(req, html):
    # Return the first capture group of `req` in `html`, or 'no' when there is no match
    text = re.search(req, html)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data
def date(timeStamp):
    timeArray = time.localtime(timeStamp)
    otherStyleTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
    return otherStyleTime
def getHTml(url):
    host = search('^([^/]*?)/', re.sub(r'(https|http)://', '', url))
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Cookie": choice(cookie),  # pick one of the Baidu account cookies at random
        "Host": host,
        "Pragma": "no-cache",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
    }
    # Proxy server (Abuyun dynamic tunnel)
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    # Proxy tunnel credentials
    proxyUser = "xxxxx"
    proxyPass = "xxxxx"
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
    html = requests.get(url, headers=headers, timeout=30)
    # html = requests.get(url, headers=headers, timeout=30, proxies=proxies)  # swap in to crawl through the tunnel
    code = html.encoding
    status_code = html.status_code
    # print(status_code)
    # time.sleep(1)  # throttle if crawling without a proxy gets you rate-limited
    return html.content
def getContent(word):
    pcurl = 'http://www.baidu.com/s?q=&tn=json&ct=2097152&si=&ie=utf-8&cl=3&wd=%s&rn=10' % word
    print('start crawl %s' % word)
    html = getHTml(pcurl)
    a = 0
    try:
        html_dict = json.loads(html)
        for tag in html_dict['feed']['entry']:
            if 'title' in tag:
                title = tag['title']
                url = tag['url']
                rank = tag['pn']
                pub_time = date(tag['time'])  # renamed from `time` to avoid shadowing the time module
                abstract = tag['abs']  # renamed from `abs` to avoid shadowing the builtin
                # Strip punctuation from the result snippet, so pseudo-original
                # articles that only re-break sentences still count as matches
                abstract = re.sub(r'[,,??!!。::]', '', abstract)
                if word in abstract:
                    a += 1
    except json.decoder.JSONDecodeError:
        print('>>>> failed to parse the result page, retrying')
        a = getContent(word)  # keep the retried count; the old code threw it away
    return a
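# Smoke test (illustrative; assumes cookie.txt is in place and the network is
# reachable). Uncomment to check one sentence before the full database run:
# print(getContent('这是一条用来测试重复度检测的足够长的句子'))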
con = mdb.connect('127.0.0.1', 'root', 'root', 'seo', charset='utf8')
cur = con.cursor()
with con:
    cur.execute("select pid,duanluo from luke_caiji limit 30")  # cap how many articles to check
    numrows = int(cur.rowcount)
    for i in range(numrows):
        row = cur.fetchone()
        aid = row[0]
        content = row[1]
        content_format = re.sub(r'<[^>]*?>', '', content)  # strip HTML tags
        a = 0
        list_sentence = re.split(r'[,,??!!。::]', content_format)
        for z in [x for x in list_sentence if len(x) > 12][:2]:  # cap how many sentences to query per article
            a += getContent(z)
        if a <= 6:  # 2 sentences * 3 allowed duplicates each
            print("%s --> %s [low duplication, worth using]\n" % (aid, a))
        else:
            print("%s --> %s\n" % (aid, a))
# words = open(wordfile).readlines()
# pool = multiprocessing.Pool(processes=10)
# for word in words:
# word = word.strip()
# pool.apply_async(getContent, (word,client ))
# pool.close()
# pool.join()
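# For reference, a working sketch of the parallel variant hinted at above,
# assuming a hypothetical words.txt with one sentence per line; the stray
# `client` argument is dropped, since getContent takes a single parameter:
# if __name__ == '__main__':
#     words = [w.strip() for w in open('words.txt', encoding='utf-8')]
#     pool = multiprocessing.Pool(processes=10)
#     print(pool.map(getContent, words))
#     pool.close()
#     pool.join()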
The code and approach here follow Chuang Ge (闯哥)'s article 《SEO文章原创度检测》 (checking how original an SEO article is). His version was written in Python 2; I ported it to Python 3 and reworked some parameters and the exception handling.