from bs4 import BeautifulSoup
from MysqlTest import *
import requests
import time
import datetime
# Verbosity flag: 1 prints every scraped record (cid, date, title, url) as it
# is processed; 0 stays quiet. Checked by caiji_byid and caiji_bynodes.
isprint = 0
def caiji_byid(cid, url, soupselect, dateselect, titleselect, urlselect, datetype = '%Y-%m-%d', datelen = 0):
    """Scrape a listing page where each item node bundles date/title/link, and store new rows.

    Parameters:
        cid         -- category id (key into the `leibie` table)
        url         -- listing page URL to scrape
        soupselect  -- CSS selector yielding one node per item
        dateselect  -- CSS selector for the date inside an item ("" = item has no date)
        titleselect -- CSS selector for the title ("" = use the item node's own text)
        urlselect   -- CSS selector for the link ("" = the item node itself is the <a>)
        datetype    -- strptime format used to normalize the date string
        datelen     -- if > 0, only the first `datelen` chars of the raw date text are parsed
    """
    t0 = time.perf_counter()
    n = 0
    for sp in caiji_byurl(url, soupselect):
        cdate = _extract_date(sp, dateselect, datetype, datelen)
        if titleselect:
            ctitle = sp.select(titleselect)[0].text.strip()
        else:
            ctitle = sp.text.strip()
        if urlselect:
            myurl = urlbase(url, sp.select(urlselect)[0].attrs['href'].strip())
        else:
            myurl = sp.attrs['href'].strip()
        if isprint == 1:
            print(cid, cdate, ctitle, myurl)
        # Special case for USTR 301 (cid 6): undated "Exclusions Granted <Month d, Y>"
        # titles embed the date after the first 19 characters of the title text.
        if cid == 6 and "Exclusions Granted" in ctitle and cdate == "":
            raw = ctitle[19:].strip().replace(" ", "")
            cdate = str(datetime.datetime.strptime(raw, "%B%d,%Y"))
        n += caiji_save(cid, cdate, ctitle, myurl)
    print_results(cid, n, t0)

def _extract_date(sp, dateselect, datetype, datelen):
    """Pull and normalize the date text from one item node; return "" when the item has none."""
    if datelen == 0:
        if not dateselect:
            # No date selector means this source carries no per-item date.
            return ""
        # Original code called .strip() twice here -- once is enough.
        raw = sp.select(dateselect)[0].text.strip().replace(" ", "")
        if len(raw) > 4:
            return str(datetime.datetime.strptime(raw, datetype))
        # Year-only (or shorter) dates are kept verbatim; caiji_save pads them.
        return raw.strip()
    # datelen > 0: parse only the first `datelen` characters of the raw text.
    node = sp.select(dateselect)[0] if dateselect else sp
    raw = node.text.strip()[:datelen].strip()
    return str(datetime.datetime.strptime(raw, datetype))
def caiji_bynodes(cid, dateselect, titleselect, url, urlselect, datetype="%Y-%m-%d"):
    """Scrape a page whose dates, titles and links live in three parallel node lists.

    The three selectors are applied to the whole page and zipped together,
    so they must yield matching, same-length node sequences.
    """
    saved = 0
    start = time.perf_counter()
    page = mysoup(url)
    items = zip(page.select(dateselect), page.select(titleselect), page.select(urlselect))
    for date_node, title_node, link_node in items:
        cdate = str(datetime.datetime.strptime(date_node.text.replace(" ", ""), datetype))
        ctitle = title_node.text.strip()
        myurl = urlbase(url, link_node.get('href').strip())
        if isprint == 1:
            print(cid, cdate, ctitle, myurl)
        saved += caiji_save(cid, cdate, ctitle, myurl)
    print_results(cid, saved, start)
# def caiji_data(cid):
# """通过leibie数据库采集信息"""
# sql = "select * from leibie where cid=%d" % (cid)
#
# print(sqlfetchone(sql))
# # if(typeid==1):
# # caiji_byid(cid, url, soupselect, dateselect, titleselect, urlselect, datetype, datelen)
# # else:
# # caiji_bynodes(cid,dateselect,titleselect,url,urlselect,datetype)
def caiji_save(cid, cdate, ctitle, url):
    """Insert one scraped record into `caiji` unless an identical row already exists.

    Returns 1 when a new row was inserted, 0 when a duplicate was found.

    NOTE(review): SQL is assembled via %-formatting with repr() as ad-hoc quoting,
    which is fragile and injection-prone for crafted titles/URLs; switch to
    parameterized queries if the MysqlTest helpers support them.
    """
    # Normalize once so the duplicate check and the insert agree on "empty".
    # (The original tested len(cdate)==0 for the SELECT but cdate.strip() for the
    # INSERT, so a whitespace-only date deduped against " " yet stored NULL.)
    cdate = cdate.strip()
    if not cdate:
        # Dateless record: duplicates are rows with a NULL cdate and same title.
        sql = "select count(*) from caiji where cid=%d and cdate is null and ctitle=%s " % (
            cid, repr(ctitle))
    else:
        # Year-only dates are pinned to January 1st of that year.
        if len(cdate) == 4:
            cdate = cdate + '-1-1'
        sql = "select count(*) from caiji where cid=%d and cdate=%s and ctitle=%s " % (
            cid, repr(cdate), repr(ctitle))
    if int(querysql(sql)[0][0]) != 0:
        return 0
    # cjdate records when this row was scraped (local time).
    cjdate = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    if cdate:
        sql = 'insert into caiji(cid,cdate,ctitle,url,cjdate) values(%d,%s,%s,%s,%s)' % (
            cid, repr(cdate), repr(ctitle), repr(url), repr(cjdate))
    else:
        sql = 'insert into caiji(cid,cdate,ctitle,url,cjdate) values(%d,null,%s,%s,%s)' % (
            cid, repr(ctitle), repr(url), repr(cjdate))
    cmdsql(sql)
    print(str(cid), cdate, ctitle, url)
    return 1
def caiji_byurl(url, soupselect):
    """Fetch `url` and return the list of nodes matching the CSS selector.

    Delegates the download/parse to mysoup() instead of duplicating the
    request headers and BeautifulSoup setup (the original repeated both).
    """
    return mysoup(url).select(soupselect)
def mysoup(url):
    """Download `url` with a desktop browser User-Agent and return a parsed soup.

    Returns a BeautifulSoup tree built with the lxml parser.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    # timeout keeps the scraper from hanging forever on a stalled server
    # (requests has no default timeout).
    html = requests.get(url, headers=headers, timeout=30).content
    soup = BeautifulSoup(html, "lxml")
    return soup
def urlbase(url, urlrr):
    """Resolve a scraped href `urlrr` against the root (scheme://host) of page `url`.

    - urlrr already absolute (contains "http"): return urlrr itself.
      (BUG FIX: the original returned the *page* URL here, silently discarding
      the actual link for absolute hrefs.)
    - urlrr relative and `url` has a scheme: return scheme://host/urlrr.
    - `url` itself has no scheme: return `url` unchanged.
    """
    if urlrr.find("http") >= 0:
        return urlrr
    if url.find("http") >= 0:
        scheme, rest = url.split("//", 1)
        host = rest.split("/")[0]
        url = scheme + "//" + host + "/" + urlrr
    return url
def print_results(cid, n, t0):
    """Report how many records a collector saved and how long it took."""
    elapsed = time.perf_counter() - t0
    # Look up the human-readable category name for this cid.
    sql = "select ctitle from leibie where cid = %d" % (cid)
    leibie = querysql(sql)[0][0]
    print("采集ID", cid, leibie, "共", n, "条记录", "耗时:", elapsed)
# ---------- site-specific collectors ----------
def caiji_zhongguooumengshanghui():
    """Collect press releases from the European Union Chamber of Commerce in China (cid 1)."""
    caiji_byid(
        cid=1,
        url="https://www.europeanchamber.com.cn/en/press-releases",
        soupselect=".panel-default",
        dateselect=".chapter-category",
        titleselect="h3 a",
        urlselect="h3 a",
        datelen=10,
    )
def caiji_unctad():
    """Collect news headlines from the UNCTAD home page (cid 3)."""
    caiji_byid(
        cid=3,
        url="https://unctad.org/en/Pages/Home.aspx",
        soupselect="#container1 .row",
        dateselect="div p b",
        titleselect="a span",
        urlselect="div div a",
        datetype="%d%B%Y",
    )
def caiji_OECD_pre():
    """Collect OECD publications from the iLibrary search listing (cid 4)."""
    caiji_byid(
        cid=4,
        url="https://www.oecd-ilibrary.org/search?value51=%27igo%2Foecd%27&sortDescending=false&value5=30191110114407&operator51=AND&value1=subtype%2Fissue+OR+subtype%2Fbook&value4=20191110114407&option5=sortDate_to&value3=status%2F50embargoDate&publisherId=%2Fcontent%2Figo%2Foecd&facetNames=pub_igoId_facet+pub_themeId_facet&option3=pub_contentStatus&sortField=prism_publicationDate&option4=sortDate_from&option1=dcterms_type&facetOptions=51+52&option51=pub_igoId_facet&operator52=AND&option52=pub_themeId_facet&value52=%27theme%2Foecd-79%27",
        soupselect=".title_box",
        dateselect=".search-metaitem + .comma_separated li",
        titleselect=".search_title",
        urlselect=".search_title a",
        datetype="%d%b%Y",
    )
def caiji_imfnews():
    """Collect the IMF what's-new list (cid 2); dates/titles/links are parallel node lists."""
    caiji_bynodes(
        cid=2,
        dateselect="#content p span",
        titleselect="h4 a",
        url="https://www.imf.org/external/what/whatsnewenglish/what.aspx",
        urlselect="h4 a",
        datetype="%B%d,%Y",
    )
def caiji_WorlBank():
    """Collect World Bank publications from the Open Knowledge repository (cid 5)."""
    caiji_byid(
        cid=5,
        url="https://openknowledge.worldbank.org/discover?scope=%2F&query=&submit=",
        soupselect=".item-metadata",
        dateselect=".date-info a",
        titleselect="h4",
        urlselect="h4 a",
        datetype="%b%d,%Y",
    )
def caiji_USTR301():
    """Collect USTR Section 301 tariff actions (cid 6), then drill one level deeper.

    First scrapes the listing page itself (undated link paragraphs), then
    follows every link on it and scrapes each target page with the same rules.
    """
    cid = 6
    url = "https://ustr.gov/issue-areas/enforcement/section-301-investigations/tariff-actions"
    # Top-level listing: link text is the title; items carry no date here
    # (caiji_byid recovers dates from "Exclusions Granted ..." titles for cid 6).
    caiji_byid(cid, url, ".content p", "", "a", "a")
    # Second pass: visit each linked page and scrape its anchors the same way.
    for link in caiji_byurl(url, ".content p a"):
        target = link.attrs['href'].strip()
        caiji_byid(cid, target, ".content p a", "", "", "")
def caiji_USTR_News():
    """Collect USTR press releases (cid 7); the date is the first 10 chars of each item."""
    caiji_byid(
        cid=7,
        url="https://ustr.gov/about-us/policy-offices/press-office/press-releases",
        soupselect=".listing li",
        dateselect="",
        titleselect="a",
        urlselect="a",
        datetype="%m/%d/%Y",
        datelen=10,
    )
def caiji_AmchamChina():
    """Collect statements from the American Chamber of Commerce in China (cid 8)."""
    caiji_byid(
        cid=8,
        url="https://www.amchamchina.org/about/press-center/amcham-statement/",
        soupselect=".tag-news",
        dateselect=".date",
        titleselect=".tag-news-title h2 a",
        urlselect=".tag-news-title h2 a",
        datetype="%d%B,%Y",
    )
if __name__ == '__main__':
    # Run every collector in sequence, in cid order.
    collectors = (
        caiji_zhongguooumengshanghui,  # 1: EU Chamber of Commerce in China
        caiji_imfnews,                 # 2: IMF news
        caiji_unctad,                  # 3: UNCTAD news
        caiji_OECD_pre,                # 4: OECD publications
        caiji_WorlBank,                # 5: World Bank publications
        caiji_USTR301,                 # 6: USTR Section 301 actions
        caiji_USTR_News,               # 7: USTR press releases
        caiji_AmchamChina,             # 8: AmCham China statements
    )
    for collect in collectors:
        collect()