"""Scrape data from specified URLs and store the entries in the database."""

from bs4 import BeautifulSoup
from mysql import *
import requests
import time
import datetime


def caiji_byid(cid, url, soupselect, dateselect, titleselect, urlselect, urlbase):
    """Scrape a listing page for channel `cid` and store each entry via caiji_save.

    Args:
        cid: channel id, stored as the `cid` column of the `caiji` table.
        url: listing page URL to fetch.
        soupselect: CSS selector yielding one node per entry.
        dateselect: CSS selector (inside an entry) for the date text;
            only the first 10 characters are kept (looks like YYYY-MM-DD —
            TODO confirm against the source page).
        titleselect: CSS selector (inside an entry) for the title text.
        urlselect: CSS selector (inside an entry) for the link element.
        urlbase: prefix joined with the entry's relative href.
    """
    t0 = time.perf_counter()
    n = 0
    for sp in caiji_byurl(url, soupselect):
        cdate = sp.select(dateselect)[0].text.strip()[:10].strip()
        ctitle = sp.select(titleselect)[0].text.strip()
        # BUG FIX: the original rebound the `url` parameter here; harmless only
        # because caiji_byurl() had already been called, but confusing — use a
        # distinct local name for the per-entry link.
        link = urlbase + sp.select(urlselect)[0].attrs['href'].strip()
        n += caiji_save(cid, cdate, ctitle, link)

    t2 = time.perf_counter()
    print("采集ID", cid, "共", n, "条记录", "耗时:", t2 - t0)


def caiji_byid_imf(cid, url, soupselect, urlbase):
    """Scrape an IMF-style listing (flat/parallel node structure) for channel `cid`.

    Unlike caiji_byid, the selected node IS the link element; the date lives in
    a <span> under the next sibling of the link's parent.

    Args:
        cid: channel id, stored as the `cid` column of the `caiji` table.
        url: listing page URL to fetch.
        soupselect: CSS selector yielding one link element per entry.
        urlbase: prefix joined with the entry's relative href.
    """
    t0 = time.perf_counter()
    n = 0
    for sp in caiji_byurl(url, soupselect):
        cdate = sp.parent.next_sibling.select("span")[0].text.strip()
        # Normalize e.g. "January 5 , 2020" (note spaces around the comma)
        # to datetime's default string form.
        cdate = str(datetime.datetime.strptime(cdate, '%B %d , %Y'))
        ctitle = sp.text.strip()
        # BUG FIX: the original rebound the `url` parameter inside the loop;
        # use a distinct local name for the per-entry link.
        link = urlbase + sp.attrs['href']
        n += caiji_save(cid, cdate, ctitle, link)

    t2 = time.perf_counter()
    print("采集ID", cid, "共", n, "条记录", "耗时:", t2 - t0)


def caiji_save(cid, cdate, ctitle, url):
    """Insert one scraped entry into the `caiji` table if not already present.

    Returns 1 when a new row was inserted, 0 when a matching
    (cid, cdate, ctitle) row already exists.

    NOTE(review): SQL is built by string interpolation with repr() as a
    makeshift quoting mechanism on values scraped from external web pages —
    this is SQL-injection-prone. If querysql/cmdsql (from the wildcard
    `mysql` import) support parameterized queries, switch to placeholders.
    """
    # Dedup check keyed on (cid, cdate, ctitle); url is intentionally excluded.
    sql = "select count(*) from caiji where cid=%d and cdate=%s and ctitle=%s " % (
        cid, repr(cdate), repr(ctitle))
    if (int(querysql(sql)[0][0]) == 0):
        sql = 'insert into caiji(cid,cdate,ctitle,url) values(%d,%s,%s,%s)' % (
            cid, repr(cdate), repr(ctitle), repr(url))
        cmdsql(sql)
        # Echo each newly stored record for console progress tracking.
        print(str(cid), cdate, ctitle, url)
        return 1
    else:
        return 0


def caiji_byurl(url, soupselect):
    """Fetch `url` with a browser User-Agent and return nodes matching `soupselect`.

    Args:
        url: page URL to download.
        soupselect: CSS selector applied to the parsed document.

    Returns:
        The list of matching BeautifulSoup elements.

    Raises:
        requests.HTTPError: on a non-2xx HTTP response.
        requests.Timeout: if the server does not answer within 30 seconds.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    # BUG FIX: requests.get without a timeout can block forever; bound it.
    resp = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on HTTP errors rather than silently parsing an error page.
    resp.raise_for_status()
    return BeautifulSoup(resp.content, "lxml").select(soupselect)


def caiji_zhongguooumengshanghui():
    """Scrape press releases of the European Union Chamber of Commerce in China (cid=1)."""
    caiji_byid(
        cid=1,
        url="https://www.europeanchamber.com.cn/en/press-releases",
        soupselect=".panel-default",
        dateselect=".chapter-category",
        titleselect="h3 a",
        urlselect="h3 a",
        urlbase="https://www.europeanchamber.com.cn",
    )


def caiji_imfnews():
    """Scrape the IMF "What's New" listing (cid=2)."""
    caiji_byid_imf(
        2,
        "https://www.imf.org/external/what/whatsnewenglish/what.aspx",
        "#content h4 a",
        "https://www.imf.org/",
    )


if __name__ == "__main__":
    # Run all scrapers only when executed as a script, not on import.
    caiji_zhongguooumengshanghui()
    caiji_imfnews()

 
