# -*- coding: utf-8 -*-
import os
import requests
import shutil
import zipfile
import time, datetime
import traceback
import langid
import scrapy
import pandas
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from multiprocessing import Pool, cpu_count
from urllib.request import urlopen
from requests.adapters import HTTPAdapter
# Root directory under which downloaded GDELT data is stored.
DATA_PATH='.'
# Timestamp string in YYYYMMDDHH form — presumably the earliest update to
# crawl; confirm against the code that consumes it (not visible in this chunk).
CRAWL_START_TIME= '2021010100'
def un_zip(filename_full, savepath):
    """Extract a zip archive into *savepath*, then delete the archive.

    Args:
        filename_full: Path to the .zip file to extract.
        savepath: Destination directory; created (with parents) if missing.

    Raises:
        zipfile.BadZipFile: If *filename_full* is not a valid zip archive.
        OSError: On filesystem errors while creating dirs or removing the file.
    """
    # exist_ok avoids the original isdir-then-mkdir race; makedirs (vs the
    # original os.mkdir) also creates any missing parent directories.
    os.makedirs(savepath, exist_ok=True)
    # Context manager guarantees the handle is closed even if extraction
    # raises (the original leaked the handle on error). extractall replaces
    # the manual per-member loop with the same result.
    with zipfile.ZipFile(filename_full) as zf:
        zf.extractall(savepath)
    # Only reached on success, matching the original's ordering: the archive
    # is kept if extraction failed.
    os.remove(filename_full)
def get_gdeltData_path(filename, savepath):
    """Return (creating it if needed) the month/day directory for a GDELT file.

    The leading component of *filename* is a timestamp like 'YYYYMMDDHHMMSS';
    the result is ``<savepath>/<YYYYMM>/<MMDD>/`` with a trailing separator so
    callers can concatenate a file name directly.

    Args:
        filename: GDELT file name whose first dot-separated part is the stamp.
        savepath: Root directory under which the month/day tree is built.

    Returns:
        The directory path, ending in the platform path separator.
    """
    # Split once instead of twice as in the original.
    stamp = filename.split('.')[0]
    month = stamp[0:6]  # YYYYMM
    day = stamp[4:8]    # MMDD — month+day, preserving the original layout
    path = os.path.join(savepath, month, day)
    # exist_ok avoids the isdir-then-makedirs race.
    os.makedirs(path, exist_ok=True)
    # BUG FIX: the original returned path + '\\', a hard-coded Windows
    # separator that produces broken paths on POSIX. Joining with '' appends
    # the platform's own separator instead.
    return os.path.join(path, '')
def download_lastupdate(savepath, filetype):
url = 'http://data.gdeltproject.org/gdeltv2/lastupdate-translation.txt'
html = urlopen(url).read()
filename = None
for line in str(html.decode('utf-8'))