# -*- coding: utf-8 -*-
import os
import re
import threading
import time,datetime
from lxml import etree
import pandas as pd
import pyodbc
import requests
from pybloom_live import BloomFilter
from fake_useragent import UserAgent
from google_translate import google_translate_EtoC as gt
# Prefix for the Bloom-filter persistence file ('<DATABASE>.blm').
DATABASE = 'China'
# Country tag stored with every article row.
COUNTRY = 'China'
# Site root; relative article hrefs are joined onto this.
SOURCE='http://www.chinanews.com/'
# Number of HTTP attempts per request (see parse()).
COUNT = 5
def Bulon():
    """Load the persisted Bloom filter for DATABASE, or create a fresh one.

    The filter de-duplicates article URLs across runs; spider() flushes it
    back to '<DATABASE>.blm' after each insertion.

    Returns:
        BloomFilter: loaded from disk if the .blm file exists, else a new
        filter sized for 1,000,000 entries at a 0.001 false-positive rate.
    """
    blm_path = '{}.blm'.format(DATABASE)
    if os.path.exists(blm_path):
        # BUG FIX: the original passed an open() result directly to
        # fromfile() and never closed the handle; `with` closes it.
        with open(blm_path, 'rb') as fh:
            return BloomFilter.fromfile(fh)
    return BloomFilter(1000000, 0.001)
# Module-level URL de-duplication filter shared by spider().
bf=Bulon()
def save_to_sql(title,originaltitle,createtime,author,content,originalcontent,articlesource,source,label,keyword,country,url,Englishtitle,Englishcontent,details,originaldetails,Englishdetails):
    """Insert one article row into the `yuqing` table.

    SECURITY/BUG FIX: the original concatenated all 17 values into the SQL
    string, which is injectable and breaks on embedded quotes; this version
    uses pyodbc `?` parameter binding and closes the connection even when
    execute()/commit() raises.

    NOTE(review): callers pre-escape ' -> '' for the old concatenation path;
    with parameter binding those doubled quotes are stored verbatim — the
    upstream .replace escaping should eventually be dropped.
    """
    conn = pyodbc.connect(r'DRIVER={SQL Server Native Client 10.0};SERVER=192.168.2.2;DATABASE=China_New;UID=sa;PWD=123456')
    try:
        cursor = conn.cursor()
        sql_insert = ("insert into yuqing (title,originaltitle,createtime,author,content,"
                      "originalcontent,articlesource,source,label,keyword,country,url,"
                      "Englishtitle,Englishcontent,details,originaldetails,Englishdetails) "
                      "values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)")
        cursor.execute(sql_insert, (title, originaltitle, createtime, author, content,
                                    originalcontent, articlesource, source, label, keyword,
                                    country, url, Englishtitle, Englishcontent, details,
                                    originaldetails, Englishdetails))
        conn.commit()
    finally:
        conn.close()
def fanyi(line, from_lang, to_lang):
    """Translate *line* with google_translate, splitting it into 2500-char
    chunks (the translator's per-call limit) and joining the pieces."""
    chunks = (line[start:start + 2500] for start in range(0, len(line), 2500))
    return ''.join(gt(chunk, from_lang, to_lang) for chunk in chunks)
def Translate(originaltitle,createtime,author,originalcontent,articlesource,source,label,keyword,country,url,originaldetails):
    """Translate the article into English and Simplified Chinese, then persist it.

    Each translated string is SQL-escaped (' -> '') exactly once.
    BUG FIX: the original re-escaped the 230-char summaries after the full
    text had already been escaped, so quotes were stored doubled (English)
    or quadrupled (Chinese content). The summaries are now plain slices of
    the already-escaped full text.
    """
    Englishdetails = fanyi(originaldetails, from_lang='auto', to_lang='en').replace("'", "''")
    Englishcontent = Englishdetails[:230]
    Englishtitle = fanyi(originaltitle, from_lang='auto', to_lang='en').replace("'", "''")
    details = fanyi(originaldetails, from_lang='auto', to_lang="zh-CN").replace("'", "''")
    content = details[:230]
    title = fanyi(originaltitle, from_lang='auto', to_lang="zh-CN").replace("'", "''")
    save_to_sql(title, originaltitle, createtime, author, content, originalcontent,
                articlesource, source, label, keyword, country, url,
                Englishtitle, Englishcontent, details, originaldetails, Englishdetails)
def parse(COUNT, header, url):
    """GET *url* with up to COUNT attempts.

    Args:
        COUNT: number of attempts remaining (local copy; module COUNT is 5).
        header: request headers dict.
        url: absolute URL to fetch.

    Returns:
        requests.Response on HTTP 200, else 0 once all attempts are spent.
        (Callers test the result for truthiness.)
    """
    while COUNT:
        try:
            response = requests.get(url, headers=header, timeout=20)
        # BUG FIX: the original bare `except:` also swallowed KeyboardInterrupt;
        # catch only network/HTTP errors.
        except requests.RequestException:
            COUNT -= 1
        else:
            if response.status_code == 200:
                return response
            COUNT -= 1
    # BUG FIX: the original could fall off the end and return None; always
    # return the documented failure value.
    return 0
def time_stamps(line):
    """Convert a Chinese timestamp 'YYYY年MM月DD日 HH:MM' to 'YYYY/MM/DD HH:MM'.

    Returns '' when *line* does not match the expected format.
    BUG FIX: the original assigned line = '' on failure but never returned
    it, so the failure path returned None and later crashed string
    concatenation in save_to_sql.
    """
    try:
        timearray = time.strptime(line, "%Y年%m月%d日 %H:%M")
    except ValueError:
        return ''
    return time.strftime("%Y/%m/%d %H:%M", timearray)
def dd(url, header):
    """Fetch one chinanews.com article page, extract its fields and persist it.

    Returns 0 when the page cannot be fetched or a mandatory field
    (title, timestamp, body) is missing; otherwise hands the extracted
    fields to Translate() -> save_to_sql().
    """
    print('url is'+ url)
    response = parse(COUNT, header, url)
    # BUG FIX: the original set response.encoding BEFORE checking the return
    # value, so a failed fetch (parse() returns 0) raised AttributeError.
    if not response:
        return 0
    response.encoding = "gb2312"  # site pages are GB2312-encoded
    selector = etree.HTML(response.text)
    country = COUNTRY
    source = 'http://www.chinanews.com/'
    # Author: keep only the text between 【 and 】 in the byline.
    try:
        author = selector.xpath('//div[@class="left_name"]/div[@class="left_name"]//text()')[0].strip().replace("'", "''")[:80]
        author = author.split("【")[1].split("】")[0]
    except Exception:
        author = ''
    # Article source: prefer the linked name; when it is implausibly short,
    # fall back to the text after the full-width space / colon.
    try:
        articlesource = selector.xpath('//div[@class="left-t"]//a/text()')[0].strip().replace("'", "''")
        if len(articlesource) < 5:
            articlesource = selector.xpath('//div[@class="left-t"]//text()')[0].split("\u3000")[-1]
            articlesource = articlesource.split(":")[-1]
    except Exception:
        articlesource = ''
    keyword = ''
    # Title and timestamp are mandatory; bail out when either is missing.
    try:
        originaltitle = selector.xpath('//div[@id="cont_1_1_2"]//h1/text()')[0].strip().replace("'", "''")
    except Exception:
        return 0
    try:
        createtime = selector.xpath('//div[@class="left-t"]//text()')[0].strip().split("\u3000")[0]
        createtime = time_stamps(createtime)
    except Exception:
        return 0
    # Body text: all paragraph text joined; first 230 chars become the summary.
    try:
        originaldetails = ''.join(selector.xpath('//div[@class="left_zw"]//p//text()')).strip().replace("'", "''")
        originalcontent = originaldetails[:230]
    except Exception:
        return 0
    # Category label: second breadcrumb entry, translated to English.
    try:
        label = fanyi(selector.xpath('//*[@id="nav"]/a[2]//text()')[0].strip().replace("'", "''"), from_lang='auto', to_lang='en')
    except Exception:
        label = ''
    print(label, '--', createtime, '--', url)
    Translate(originaltitle, createtime, author, originalcontent, articlesource, source,
              label, keyword, country, url, originaldetails)
def spider():
    """Crawl chinanews.com scroll-news listing pages day by day (from today
    back to 2017-07-18) for the 'gj' (world) and 'wh' (culture) channels,
    de-duplicate article URLs through the module Bloom filter, and process
    each new article with dd().
    """
    header = {'User-Agent': UserAgent().random, 'Accept-Language': 'zh-CN,zh;q=0.9'}
    lab = ['gj', 'wh']
    for channel in lab:
        begin = datetime.date(2017, 7, 18)
        end = datetime.date.today()
        delta = datetime.timedelta(days=1)
        d = end
        while d >= begin:
            date = d.strftime("%Y/%m%d")  # listing URLs use 'YYYY/MMDD'
            print(date)
            d -= delta
            main_url = "http://www.chinanews.com/scroll-news/" + channel + "/" + date + "/news.shtml"
            main_response = parse(COUNT, header, main_url)
            # BUG FIX: parse() returns 0 on failure; the original called
            # .text on it unconditionally. Skip unreachable listing pages.
            if not main_response:
                continue
            main_selector = etree.HTML(main_response.text)
            for href in main_selector.xpath('//li//div[@class="dd_bt"]/a/@href'):
                full_url = SOURCE + href
                # BUG FIX: the original tested the relative href against the
                # filter but ADDED the full URL, so the dedup check never
                # matched; use the full URL for both.
                if full_url in bf:
                    print('yeah_continue!!!!!!!!!!!!!!!!')
                    continue
                bf.add(full_url)
                # Persist the filter after each addition so a crash does not
                # lose progress; `with` closes the handle (original leaked it).
                with open('{}.blm'.format(DATABASE), 'wb') as fh:
                    bf.tofile(fh)
                # BUG FIX: the original passed the relative href to dd(),
                # which requests.get cannot fetch.
                dd(full_url, header)
if __name__ == '__main__':
    # Entry point: run one full crawl over the configured date range.
    spider()
# 中国新闻网 (chinanews.com) crawler.
# NOTE: the following blog-page footer residue was commented out so the file parses:
# "最新推荐文章于 2022-05-07 12:54:02 发布" (latest recommended article published 2022-05-07 12:54:02)