# -*- coding: utf-8 -*-
import os
import re
import threading
import time,datetime
from lxml import etree
import pandas as pd
import pyodbc
import requests
from pybloom_live import BloomFilter
from fake_useragent import UserAgent
from google_translate import google_translate_EtoC as gt
# Prefix for the Bloom-filter persistence file ('<DATABASE>.blm').
DATABASE = 'China'
# Country tag stored with every article row.
COUNTRY = 'China'
# Site root; relative article hrefs are joined onto this.
SOURCE='http://www.chinanews.com/'
# Number of HTTP attempts per request (see parse()).
COUNT = 5
def Bulon():
    """Load the persisted Bloom filter for DATABASE, or create a fresh one.

    The filter de-duplicates article URLs across runs; spider() flushes it
    back to '<DATABASE>.blm' after each insertion.

    Returns:
        BloomFilter: loaded from disk if the .blm file exists, else a new
        filter sized for 1,000,000 entries at a 0.001 false-positive rate.
    """
    blm_path = '{}.blm'.format(DATABASE)
    if os.path.exists(blm_path):
        # BUG FIX: the original passed an open() result directly to
        # fromfile() and never closed the handle; `with` closes it.
        with open(blm_path, 'rb') as fh:
            return BloomFilter.fromfile(fh)
    return BloomFilter(1000000, 0.001)
# Module-level URL de-duplication filter shared by spider().
bf=Bulon()
def save_to_sql(title,originaltitle,createtime,author,content,originalcontent,articlesource,source,label,keyword,country,url,Englishtitle,Englishcontent,details,originaldetails,Englishdetails):
    """Insert one article row into the `yuqing` table.

    SECURITY/BUG FIX: the original concatenated all 17 values into the SQL
    string, which is injectable and breaks on embedded quotes; this version
    uses pyodbc `?` parameter binding and closes the connection even when
    execute()/commit() raises.

    NOTE(review): callers pre-escape ' -> '' for the old concatenation path;
    with parameter binding those doubled quotes are stored verbatim — the
    upstream .replace escaping should eventually be dropped.
    """
    conn = pyodbc.connect(r'DRIVER={SQL Server Native Client 10.0};SERVER=192.168.2.2;DATABASE=China_New;UID=sa;PWD=123456')
    try:
        cursor = conn.cursor()
        sql_insert = ("insert into yuqing (title,originaltitle,createtime,author,content,"
                      "originalcontent,articlesource,source,label,keyword,country,url,"
                      "Englishtitle,Englishcontent,details,originaldetails,Englishdetails) "
                      "values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)")
        cursor.execute(sql_insert, (title, originaltitle, createtime, author, content,
                                    originalcontent, articlesource, source, label, keyword,
                                    country, url, Englishtitle, Englishcontent, details,
                                    originaldetails, Englishdetails))
        conn.commit()
    finally:
        conn.close()
def fanyi(line, from_lang, to_lang):
    """Translate *line* with google_translate, splitting it into 2500-char
    chunks (the translator's per-call limit) and joining the pieces."""
    chunks = (line[start:start + 2500] for start in range(0, len(line), 2500))
    return ''.join(gt(chunk, from_lang, to_lang) for chunk in chunks)
def Translate(originaltitle,createtime,author,originalcontent,articlesource,source,label,keyword,country,url,originaldetails):
    """Translate the article into English and Simplified Chinese, then persist it.

    Each translated string is SQL-escaped (' -> '') exactly once.
    BUG FIX: the original re-escaped the 230-char summaries after the full
    text had already been escaped, so quotes were stored doubled (English)
    or quadrupled (Chinese content). The summaries are now plain slices of
    the already-escaped full text.
    """
    Englishdetails = fanyi(originaldetails, from_lang='auto', to_lang='en').replace("'", "''")
    Englishcontent = Englishdetails[:230]
    Englishtitle = fanyi(originaltitle, from_lang='auto', to_lang='en').replace("'", "''")
    details = fanyi(originaldetails, from_lang='auto', to_lang="zh-CN").replace("'", "''")
    content = details[:230]
    title = fanyi(originaltitle, from_lang='auto', to_lang="zh-CN").replace("'", "''")
    save_to_sql(title, originaltitle, createtime, author, content, originalcontent,
                articlesource, source, label, keyword, country, url,
                Englishtitle, Englishcontent, details, originaldetails, Englishdetails)
def parse(COUNT, header, url):
    """GET *url* with up to COUNT attempts.

    Args:
        COUNT: number of attempts remaining (local copy; module COUNT is 5).
        header: request headers dict.
        url: absolute URL to fetch.

    Returns:
        requests.Response on HTTP 200, else 0 once all attempts are spent.
        (Callers test the result for truthiness.)
    """
    while COUNT:
        try:
            response = requests.get(url, headers=header, timeout=20)
        # BUG FIX: the original bare `except:` also swallowed KeyboardInterrupt;
        # catch only network/HTTP errors.
        except requests.RequestException:
            COUNT -= 1
        else:
            if response.status_code == 200:
                return response
            COUNT -= 1
    # BUG FIX: the original could fall off the end and return None; always
    # return the documented failure value.
    return 0
def time_stamps(line):
    """Convert a Chinese timestamp 'YYYY年MM月DD日 HH:MM' to 'YYYY/MM/DD HH:MM'.

    Returns '' when *line* does not match the expected format.
    BUG FIX: the original assigned line = '' on failure but never returned
    it, so the failure path returned None and later crashed string
    concatenation in save_to_sql.
    """
    try:
        timearray = time.strptime(line, "%Y年%m月%d日 %H:%M")
    except ValueError:
        return ''
    return time.strftime("%Y/%m/%d %H:%M", timearray)
def dd(url, header):
    """Fetch one chinanews.com article page, extract its fields and persist it.

    Returns 0 when the page cannot be fetched or a mandatory field
    (title, timestamp, body) is missing; otherwise hands the extracted
    fields to Translate() -> save_to_sql().
    """
    print('url is'+ url)
    response = parse(COUNT, header, url)
    # BUG FIX: the original set response.encoding BEFORE checking the return
    # value, so a failed fetch (parse() returns 0) raised AttributeError.
    if not response:
        return 0
    response.encoding = "gb2312"  # site pages are GB2312-encoded
    selector = etree.HTML(response.text)
    country = COUNTRY
    source = 'http://www.chinanews.com/'
    # Author: keep only the text between 【 and 】 in the byline.
    try:
        author = selector.xpath('//div[@class="left_name"]/div[@class="left_name"]//text()')[0].strip().replace("'", "''")[:80]
        author = author.split("【")[1].split("】")[0]
    except Exception:
        author = ''
    # Article source: prefer the linked name; when it is implausibly short,
    # fall back to the text after the full-width space / colon.
    try:
        articlesource = selector.xpath('//div[@class="left-t"]//a/text()')[0].strip().replace("'", "''")
        if len(articlesource) < 5:
            articlesource = selector.xpath('//div[@class="left-t"]//text()')[0].split("\u3000")[-1]
            articlesource = articlesource.split(":")[-1]
    except Exception:
        articlesource = ''
    keyword = ''
    # Title and timestamp are mandatory; bail out when either is missing.
    try:
        originaltitle = selector.xpath('//div[@id="cont_1_1_2"]//h1/text()')[0].strip().replace("'", "''")
    except Exception:
        return 0
    try:
        createtime = selector.xpath('//div[@class="left-t"]//text()')[0].strip().split("\u3000")[0]
        createtime = time_stamps(createtime)
    except Exception:
        return 0
    # Body text: all paragraph text joined; first 230 chars become the summary.
    try:
        originaldetails = ''.join(selector.xpath('//div[@class="left_zw"]//p//text()')).strip().replace("'", "''")
        originalcontent = originaldetails[:230]
    except Exception:
        return 0
    # Category label: second breadcrumb entry, translated to English.
    try:
        label = fanyi(selector.xpath('//*[@id="nav"]/a[2]//text()')[0].strip().replace("'", "''"), from_lang='auto', to_lang='en')
    except Exception:
        label = ''
    print(label, '--', createtime, '--', url)
    Translate(originaltitle, createtime, author, originalcontent, articlesource, source,
              label, keyword, country, url, originaldetails)
def spider():
    """Crawl chinanews.com scroll-news listing pages day by day (from today
    back to 2017-07-18) for the 'gj' (world) and 'wh' (culture) channels,
    de-duplicate article URLs through the module Bloom filter, and process
    each new article with dd().
    """
    header = {'User-Agent': UserAgent().random, 'Accept-Language': 'zh-CN,zh;q=0.9'}
    lab = ['gj', 'wh']
    for channel in lab:
        begin = datetime.date(2017, 7, 18)
        end = datetime.date.today()
        delta = datetime.timedelta(days=1)
        d = end
        while d >= begin:
            date = d.strftime("%Y/%m%d")  # listing URLs use 'YYYY/MMDD'
            print(date)
            d -= delta
            main_url = "http://www.chinanews.com/scroll-news/" + channel + "/" + date + "/news.shtml"
            main_response = parse(COUNT, header, main_url)
            # BUG FIX: parse() returns 0 on failure; the original called
            # .text on it unconditionally. Skip unreachable listing pages.
            if not main_response:
                continue
            main_selector = etree.HTML(main_response.text)
            for href in main_selector.xpath('//li//div[@class="dd_bt"]/a/@href'):
                full_url = SOURCE + href
                # BUG FIX: the original tested the relative href against the
                # filter but ADDED the full URL, so the dedup check never
                # matched; use the full URL for both.
                if full_url in bf:
                    print('yeah_continue!!!!!!!!!!!!!!!!')
                    continue
                bf.add(full_url)
                # Persist the filter after each addition so a crash does not
                # lose progress; `with` closes the handle (original leaked it).
                with open('{}.blm'.format(DATABASE), 'wb') as fh:
                    bf.tofile(fh)
                # BUG FIX: the original passed the relative href to dd(),
                # which requests.get cannot fetch.
                dd(full_url, header)
if __name__ == '__main__':
    # Entry point: run one full crawl over the configured date range.
    spider()
# 中国新闻网 (chinanews.com) crawler.
# NOTE: the following blog-page footer residue was commented out so the file parses:
# "最新推荐文章于 2022-05-07 12:54:02 发布" (latest recommended article published 2022-05-07 12:54:02)