爬取电影标题、评论、评分(21-11-4)

功能描述:
1、爬取网页1中的电影名称、评分、简介到mysql数据库中。
2、爬取网页2中的标题、时间、正文、采集时间到mysql数据库中。
使用的技术:requests 请求、xpath解析、mysql

xpath解析语法

//
子孙节点
/
直接子节点
.
选取当前节点
..
选取当前节点的父节点
@
选取属性
通过 Python 的 lxml 库,利用 XPath 进行 HTML 的解析;scrapy 封装了 lxml,因此也可以直接从 scrapy 导入 Selector 来完成同样的解析。

任务1代码实现

"""
@Description:
@Usage:
@Author: liuxianglong
@Date: 2021/11/4 下午3:44
"""

import requests
from scrapy import Selector
import pymysql
from pymysql.cursors import DictCursor

def mysql_conn(host='localhost', user='root', passwd='123454321',
               database='practice', port=3306, charset='utf8mb4'):
    """Open a MySQL connection and return a ``(connection, cursor)`` pair.

    The cursor is a ``DictCursor``, so fetched rows come back as dicts.
    All connection settings default to the local development database, so
    existing callers (``mysql_conn()``) behave exactly as before, while
    tests or other environments can override any setting.

    NOTE(review): the credentials are hard-coded defaults — move them to
    config/env for anything beyond local practice use.
    """
    _conn = pymysql.connect(
        host=host,
        user=user,
        passwd=passwd,
        database=database,
        port=port,
        charset=charset,
    )
    _cur = _conn.cursor(DictCursor)
    return _conn, _cur


class MysqlORM(object):
    """Thin dict-based helper around a pymysql connection/cursor pair.

    Builds parameterized INSERT/UPDATE statements from plain dicts so
    callers never splice *values* into SQL text themselves.  Table and
    column names ARE interpolated directly into the statement, so they
    must come from trusted code, never from user input.
    """

    def __init__(self, conn, cur):
        self.conn = conn
        self.cur = cur

    def insert_one(self, table: str, data: dict):
        """Insert *data* as one row of *table*; return ``cursor.lastrowid``.

        Uses ``INSERT IGNORE``, so duplicate-key rows are silently
        skipped (``lastrowid`` is then 0).  Values travel as pyformat
        ``%(key)s`` parameters, never inlined into the SQL string.
        """
        columns = ','.join(data.keys())
        placeholders = ','.join(f'%({k})s' for k in data)
        sql = f'insert ignore into {table}({columns}) values({placeholders})'
        self.cur.execute(sql, data)
        self.conn.commit()
        rowid = self.cur.lastrowid
        print(f'{table} 插入一条数据 {rowid}')
        return rowid

    def update_one(self, table: str, data: dict, fixed: list):
        """Update one row: keys listed in *fixed* form the WHERE clause
        (ANDed together, in *fixed* order), every other key of *data*
        is assigned in the SET clause.

        Raises ValueError when *data* has no key outside *fixed* — the
        original code then emitted ``update t set  where ...``, which is
        invalid SQL and would fail later with an opaque server error.
        """
        fields = [f'{name}=%({name})s' for name in data.keys() if name not in fixed]
        if not fields:
            raise ValueError('update_one: no columns to update outside of `fixed`')
        where_phrase = [f'{name}=%({name})s' for name in fixed]
        where = ' and '.join(where_phrase)
        update_sql = f'update {table} set {",".join(fields)} where {where}'
        self.cur.execute(update_sql, data)
        self.conn.commit()
        print(f'{table} 更新一条数据到 {table} 成功')


# movie_data = {
#     'title': '小兵张嘎',
#     'publish_year': '2021',
#     'description': '...',
#     'score': '9.3'
# }


if __name__ == '__main__':
    conn, cur = mysql_conn()
    mysql_client = MysqlORM(conn, cur)

    # The original issued this HTTP request at module level, i.e. as an
    # import-time side effect outside the __main__ guard; fetch only when
    # actually run as a script.
    response = requests.get(
        'https://www.imdb.com/search/title/?count=100&title_type=feature,tv_series,tv_movie&ref_=nv_ch_mm_1'
    )
    page = Selector(text=response.text)

    for movie_div in page.xpath('//div[@class="lister-list"]/div'):
        # .get() returns the first matched text node, or None when the
        # element is absent (MySQL then stores NULL for that column).
        movie_data = {
            'title': movie_div.xpath('.//h3/a/text()').get(),
            'publish_year': movie_div.xpath('.//h3//span[2]/text()').get(),
            'description': movie_div.xpath('.//p[@class="text-muted"]/text()').get(),
            'score': movie_div.xpath('.//div[@class="ratings-bar"]//div[@name="ir"]//@data-value').get(),
        }
        mysql_client.insert_one('movie_table', movie_data)

任务二



from datetime import datetime
import time
import requests
from scrapy import Selector
import pymysql
from pymysql.cursors import DictCursor


def mysql_conn():
    """Connect to the local `practice` MySQL database.

    Returns a ``(connection, cursor)`` pair where the cursor is a
    ``DictCursor`` (rows are fetched as dicts).
    """
    connection = pymysql.connect(
        host='localhost',
        user='root',
        passwd='123454321',
        database='practice',
        port=3306,
        charset='utf8mb4',
    )
    return connection, connection.cursor(DictCursor)


class MysqlORM(object):
    """Small SQL helper bound to a pymysql connection and cursor.

    Statements are built from plain dicts; values are always passed as
    pyformat ``%(key)s`` parameters rather than spliced into the SQL.
    """

    def __init__(self, conn, cur):
        self.conn = conn
        self.cur = cur

    def insert_one(self, table: str, data: dict):
        """Insert *data* as a single row of *table*; return the new row id."""
        keys = list(data.keys())
        sql = 'insert ignore into {}({}) values({})'.format(
            table,
            ','.join(keys),
            ','.join('%({})s'.format(k) for k in keys),
        )
        self.cur.execute(sql, data)
        self.conn.commit()
        rowid = self.cur.lastrowid
        print(f'{table} 插入一条数据 {rowid}')
        return rowid

    def update_one(self, table: str, data: dict, fixed: list):
        """UPDATE *table*: *fixed* keys become the WHERE clause (in the
        order given), all remaining keys of *data* become SET assignments."""
        set_parts = [f'{k}=%({k})s' for k in data if k not in fixed]
        where_clause = ' and '.join(f'{k}=%({k})s' for k in fixed)
        update_sql = 'update {} set {} where {}'.format(
            table, ','.join(set_parts), where_clause
        )
        self.cur.execute(update_sql, data)
        self.conn.commit()
        print(f'{table} 更新一条数据到 {table} 成功')


if __name__ == '__main__':
    conn, cur = mysql_conn()
    mysql_client = MysqlORM(conn, cur)

    resp = requests.get('https://www.who.int/emergencies/disease-outbreak-news')
    resp = Selector(text=resp.text)

    # enumerate replaces the hand-rolled `i=0; i=i+1` counter; the index
    # is printed purely as progress output, as in the original.
    for index, dis_div in enumerate(resp.xpath('//div[@class="sf-list-vertical"]/a'), start=1):
        print(index)
        raw_time = dis_div.xpath('.//h4//span[2]//text()').get()
        title = dis_div.xpath('.//h4//span[3]//text()').get()
        href = dis_div.xpath('.//@href').get()
        if raw_time is None or href is None:
            # Malformed list entry: skip it instead of crashing on
            # None[:-3] / requests.get(None) as the original would.
            continue
        # Strip the last 3 characters to leave just the date text —
        # TODO confirm against the live WHO markup.
        p_time = raw_time[:-3]

        resp_2 = requests.get(href)
        resp_2 = Selector(text=resp_2.text)
        # Concatenate every text node of the article's paragraphs.
        content = ''.join(
            pc.get()
            for paragraph in resp_2.xpath('//article//p')
            for pc in paragraph.xpath('.//text()')
        )

        print("%s的长度%d" % (p_time, len(p_time)))
        # NOTE(review): "%d %B %Y" parses an English month name; this is
        # locale-dependent and fails under a non-English LC_TIME.
        publish_struct = time.strptime(p_time, "%d %B %Y")
        dis_data = {
            'title': title,
            'publish_time': time.asctime(publish_struct),
            'collect_time': time.asctime(time.localtime()),
            'content': content,
            'url': href,
        }
        print(dis_data['collect_time'])
        mysql_client.insert_one('disease', dis_data)

引用的代码:
刘祥龙
17379710817

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值