Scraping data from several websites with Python (refactored into functions)

from bs4 import BeautifulSoup
from MysqlTest import *  # local helper module providing querysql() and cmdsql()
import requests
import time
import datetime

# Whether to print each record while collecting (1 = print, 0 = silent)
isprint = 0
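
The querysql and cmdsql helpers come from a local MysqlTest module that is not shown in this post. Purely as a sketch of what such a module might look like, assuming pymysql; the connection settings and database name below are hypothetical and the real module may differ:

import pymysql

def _connect():
    # hypothetical connection settings; adjust to the real MysqlTest configuration
    return pymysql.connect(host='localhost', user='root', password='***',
                           database='caijidb', charset='utf8mb4')

def querysql(sql, args=None):
    """Run a SELECT and return all rows as a tuple of tuples"""
    conn = _connect()
    try:
        with conn.cursor() as cur:
            cur.execute(sql, args)
            return cur.fetchall()
    finally:
        conn.close()

def cmdsql(sql, args=None):
    """Run an INSERT/UPDATE/DELETE and commit it"""
    conn = _connect()
    try:
        with conn.cursor() as cur:
            cur.execute(sql, args)
        conn.commit()
    finally:
        conn.close()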

def caiji_byid(cid, url, soupselect, dateselect, titleselect, urlselect, urlbase, datetype='%Y-%m-%d'):
    """Collect the matching items for the given category id and save them to the database"""
    t0 = time.perf_counter()
    n = 0
    for sp in caiji_byurl(url, soupselect):
        cdate = sp.select(dateselect)[0].text.strip()
        cdate = str(datetime.datetime.strptime(cdate, datetype))
        ctitle = sp.select(titleselect)[0].text.strip()
        curl = urlbase + sp.select(urlselect)[0].attrs['href'].strip()
        if isprint == 1:
            print(cid, cdate, ctitle, curl)
        n += caiji_save(cid, cdate, ctitle, curl)

    print_results(cid, n, t0)

def caiji_byid_imf(cid, url, soupselect, urlbase):
    """Collect the matching items for the given category id; the date sits in a parallel (sibling) node"""
    t0 = time.perf_counter()
    n = 0
    for sp in caiji_byurl(url, soupselect):
        cdate = sp.parent.next_sibling.select("span")[0].text.strip()
        cdate = str(datetime.datetime.strptime(cdate, '%B %d , %Y'))
        ctitle = sp.text.strip()
        curl = urlbase + sp.attrs['href']
        if isprint == 1:
            print(cid, cdate, ctitle, curl)
        n += caiji_save(cid, cdate, ctitle, curl)

    print_results(cid, n, t0)


def caiji_save(cid, cdate, ctitle, url):
    """Insert the record if it is not already in the caiji table; return 1 when inserted, 0 when it already exists"""
    sql = "select count(*) from caiji where cid=%d and cdate=%s and ctitle=%s " % (
        cid, repr(cdate), repr(ctitle))
    if int(querysql(sql)[0][0]) == 0:
        sql = 'insert into caiji(cid,cdate,ctitle,url) values(%d,%s,%s,%s)' % (
            cid, repr(cdate), repr(ctitle), repr(url))
        cmdsql(sql)
        print(str(cid), cdate, ctitle, url)
        return 1
    else:
        return 0
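
caiji_save builds its SQL with % formatting plus repr(), which works for simple values but can break on titles containing quotes or backslashes. As a sketch only, here is a hypothetical caiji_save_params variant using parameterized queries, assuming the database helpers accept a parameter tuple (as in the pymysql sketch above):

def caiji_save_params(cid, cdate, ctitle, url):
    """Same dedup-then-insert logic, but letting the driver handle quoting"""
    # assumes querysql/cmdsql pass the tuple through to cursor.execute()
    sql = "select count(*) from caiji where cid=%s and cdate=%s and ctitle=%s"
    if int(querysql(sql, (cid, cdate, ctitle))[0][0]) == 0:
        sql = "insert into caiji(cid,cdate,ctitle,url) values(%s,%s,%s,%s)"
        cmdsql(sql, (cid, cdate, ctitle, url))
        print(cid, cdate, ctitle, url)
        return 1
    return 0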


def caiji_byurl(url, soupselect):
    """Fetch the page and return the elements matched by the CSS selector"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    html = requests.get(url, headers=headers).content
    return BeautifulSoup(html, "lxml").select(soupselect)
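
caiji_byurl issues the request with no timeout or error handling, so one slow or unreachable site can hang or abort the whole run. A hypothetical caiji_byurl_safe sketch with a timeout and a status check (the 15-second timeout is an arbitrary choice):

def caiji_byurl_safe(url, soupselect, timeout=15):
    """Fetch the page and return the matched elements, or an empty list on failure"""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
    }
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)  # timeout value is an arbitrary choice
        resp.raise_for_status()
    except requests.RequestException as e:
        print("fetch failed:", url, e)
        return []
    return BeautifulSoup(resp.content, "lxml").select(soupselect)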


def caiji_zhongguooumengshanghui():
    """采集中国欧盟商会信息"""
    """"""
    cid = 1
    url = "https://www.europeanchamber.com.cn/en/press-releases"
    dateselect = ".chapter-category"
    titleselect = "h3 a"
    urlselect = "h3 a"
    urlbase = "https://www.europeanchamber.com.cn"
    soupselect = ".panel-default"
    caiji_datelen(cid=cid, url=url, soupselect=soupselect, dateselect=dateselect, titleselect=titleselect,
               urlselect=urlselect, urlbase=urlbase)


def caiji_datelen(cid, url, soupselect, dateselect, titleselect, urlselect, urlbase, datetype='%Y-%m-%d', datelen=10):
    """Collect the matching items for the given category id, keeping only the first datelen characters of the date text"""
    t0 = time.perf_counter()
    n = 0
    for sp in caiji_byurl(url, soupselect):
        cdate = sp.select(dateselect)[0].text.strip()[:datelen].strip()
        cdate = str(datetime.datetime.strptime(cdate, datetype))
        ctitle = sp.select(titleselect)[0].text.strip()
        curl = urlbase + sp.select(urlselect)[0].attrs['href'].strip()
        if isprint == 1:
            print(cid, cdate, ctitle, curl)
        n += caiji_save(cid, cdate, ctitle, curl)

    print_results(cid, n, t0)


def caiji_imfnews():
    """采集IFM最新信息"""
    """日期为平行结构"""
    cid = 2
    url = "https://www.imf.org/external/what/whatsnewenglish/what.aspx"
    soupselect = "#content h4 a"
    urlbase = "https://www.imf.org"
    caiji_byid_imf(cid, url, soupselect, urlbase)


def caiji_unctad():
    """采集联合国贸发会议新闻"""
    cid = 3
    url = "https://unctad.org/en/Pages/Home.aspx"
    soupselect = "#container1 .row"
    urlbase = "https://unctad.org"
    dateselect = "div p b"
    titleselect = "a span"
    urlselect = "div div a"
    datetype = "%d %B %Y"
    caiji_byid(cid, url, soupselect, dateselect, titleselect, urlselect, urlbase, datetype)



def caiji_OECD_pre():
    """采集OECD PRE新闻"""
    cid = 4
    url = "https://www.oecd-ilibrary.org/search?value51=%27igo%2Foecd%27&sortDescending=false&value5=30191110114407&operator51=AND&value1=subtype%2Fissue+OR+subtype%2Fbook&value4=20191110114407&option5=sortDate_to&value3=status%2F50embargoDate&publisherId=%2Fcontent%2Figo%2Foecd&facetNames=pub_igoId_facet+pub_themeId_facet&option3=pub_contentStatus&sortField=prism_publicationDate&option4=sortDate_from&option1=dcterms_type&facetOptions=51+52&option51=pub_igoId_facet&operator52=AND&option52=pub_themeId_facet&value52=%27theme%2Foecd-79%27"
    soupselect = ".title_box"
    urlbase = "https://www.oecd-ilibrary.org"
    dateselect = ".search-metaitem + .comma_separated li"
    titleselect = ".search_title"
    urlselect = ".search_title a"
    datetype = "%d %b %Y"
    caiji_byid(cid, url, soupselect, dateselect, titleselect, urlselect, urlbase, datetype)


def print_results(cid, n, t0):
    """打印采集数据结果和所用时间"""
    t2 = time.perf_counter()
    sql = "select ctitle from leibie where cid = %d" % (cid)
    leibie = querysql(sql)[0][0]
    print("采集ID", cid, leibie, "共", n, "条记录", "耗时:", t2 - t0)


# 1. European Union Chamber of Commerce in China news
caiji_zhongguooumengshanghui()
# 2. IMF news
caiji_imfnews()
# 3. UNCTAD news
caiji_unctad()
# 4. OECD news
caiji_OECD_pre()
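
The four collectors are called directly at module level, so an unhandled exception from any one site (a changed page layout, a network error) stops the remaining ones. A hypothetical run_all sketch that lets the other collectors continue when one fails:

def run_all():
    """Run every collector, logging failures instead of aborting the whole run"""
    collectors = (caiji_zhongguooumengshanghui, caiji_imfnews,
                  caiji_unctad, caiji_OECD_pre)
    for collector in collectors:
        try:
            collector()
        except Exception as e:
            print("collector failed:", collector.__name__, e)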

 
