功能描述:
1、爬取网页1中的电影名称、评分、简介到mysql数据库中。
2、爬取网页2中的标题、时间、正文、采集时间到mysql数据库中。
使用的技术:requests 请求、xpath解析、mysql
xpath解析语法
//
子孙节点
/
直接子节点
.
选取当前节点
..
选取当前节点的父节点
@
选取属性
通过 Python 的 lxml 库,利用 XPath 进行 HTML 的解析。Scrapy 封装了 lxml,因此也可以直接导入并使用 Scrapy 的 Selector。
任务1代码实现
"""
@Description:
@Usage:
@Author: liuxianglong
@Date: 2021/11/4 下午3:44
"""
import requests
from scrapy import Selector
import pymysql
from pymysql.cursors import DictCursor
def mysql_conn(host='localhost', user='root', password='123454321',
               database='practice', port=3306, charset='utf8mb4'):
    """Open a MySQL connection and return a (connection, DictCursor) pair.

    All parameters default to the development database, so existing
    callers (``mysql_conn()``) keep working unchanged, while tests or
    other environments can pass their own connection settings.

    Returns:
        tuple: (pymysql connection, cursor that yields rows as dicts)
    """
    _conn = pymysql.connect(
        host=host,
        user=user,
        password=password,  # `passwd` is a deprecated alias in PyMySQL
        database=database,
        port=port,
        charset=charset
    )
    _cur = _conn.cursor(DictCursor)
    return _conn, _cur
class MysqlORM(object):
    """Thin helper wrapping INSERT/UPDATE statements on a pymysql connection.

    Values are always bound as pyformat (%(name)s) parameters so they are
    escaped by the driver; table and column names are interpolated directly
    and therefore must come from trusted code, never from user input.
    """

    def __init__(self, conn, cur):
        # conn: an open pymysql connection; cur: a cursor created from it.
        self.conn = conn
        self.cur = cur

    def insert_one(self, table: str, data: dict):
        """INSERT IGNORE *data* into *table* and return the cursor's lastrowid.

        Column names are taken from data's keys; duplicate-key rows are
        silently skipped because of INSERT IGNORE.
        """
        columns = ','.join(data.keys())
        placeholders = ','.join('%({})s'.format(k) for k in data.keys())
        sql = f'insert ignore into {table}({columns}) values({placeholders})'
        self.cur.execute(sql, data)
        self.conn.commit()
        rowid = self.cur.lastrowid
        print(f'{table} 插入一条数据 {rowid}')
        return rowid

    def update_one(self, table: str, data: dict, fixed: list):
        """UPDATE *table*: keys listed in *fixed* build the WHERE clause,
        the remaining keys of *data* become the SET assignments.
        """
        fields = [f'{name}=%({name})s' for name in data.keys() if name not in fixed]
        where = ' and '.join(f'{name}=%({name})s' for name in fixed)
        update_sql = f'update {table} set {",".join(fields)} where {where}'
        self.cur.execute(update_sql, data)
        self.conn.commit()
        print(f'{table} 更新一条数据成功')
# Shape of a row inserted into movie_table:
#   {'title': ..., 'publish_year': ..., 'description': ..., 'score': ...}

if __name__ == '__main__':
    conn, cur = mysql_conn()
    mysql_client = MysqlORM(conn, cur)
    # Fetch the IMDb list page inside the __main__ guard so that merely
    # importing this module no longer triggers a network request.
    resp = requests.get('https://www.imdb.com/search/title/?count=100&title_type=feature,tv_series,tv_movie&ref_=nv_ch_mm_1')
    sel = Selector(text=resp.text)
    for movie_div in sel.xpath('//div[@class="lister-list"]/div'):
        # .get() is required to pull the actual text out of the Selector.
        title = movie_div.xpath('.//h3/a/text()').get()
        year = movie_div.xpath('.//h3//span[2]/text()').get()
        intro = movie_div.xpath('.//p[@class="text-muted"]/text()').get()
        # data-value carries the numeric rating on the old IMDb list markup
        # — TODO confirm the selector still matches the current page layout.
        score = movie_div.xpath('.//div[@class="ratings-bar"]//div[@name="ir"]//@data-value').get()
        movie_data = {
            'title': title,
            'publish_year': year,
            'description': intro,
            'score': score,
        }
        mysql_client.insert_one('movie_table', movie_data)
任务二代码实现
from datetime import datetime
import time
import requests
from scrapy import Selector
import pymysql
from pymysql.cursors import DictCursor
def mysql_conn():
    """Connect to the development MySQL database.

    Returns:
        tuple: (connection, cursor) where the cursor yields rows as
        dicts (DictCursor).
    """
    connection = pymysql.connect(
        host='localhost',
        user='root',
        passwd='123454321',
        database='practice',
        port=3306,
        charset='utf8mb4',
    )
    dict_cursor = connection.cursor(DictCursor)
    return connection, dict_cursor
class MysqlORM(object):
    """Minimal helper for INSERT/UPDATE statements on a pymysql connection.

    Values are bound as pyformat (%(name)s) parameters and escaped by the
    driver; table and column names are interpolated directly and must come
    from trusted code, never from user input.
    """
    def __init__(self, conn, cur):
        # conn: an open pymysql connection; cur: a cursor created from it.
        self.conn = conn
        self.cur = cur
    def insert_one(self, table: str, data: dict):
        """INSERT IGNORE *data* into *table*; return the cursor's lastrowid.

        Column names are taken from data's keys; duplicate-key rows are
        silently skipped because of INSERT IGNORE.
        """
        name = ','.join(data.keys())
        col = ','.join('%({})s'.format(k) for k in data.keys())
        sql = f'insert ignore into {table}({name}) values({col})'
        self.cur.execute(sql, data)
        self.conn.commit()
        rowid = self.cur.lastrowid
        print(f'{table} 插入一条数据 {rowid}')
        return rowid
    def update_one(self, table: str, data: dict, fixed: list):
        """UPDATE *table*: keys listed in *fixed* build the WHERE clause,
        the remaining keys of *data* become the SET assignments.
        """
        # NOTE(review): "fileds" is a typo for "fields" (local name only).
        fileds = [f'{name}=%({name})s' for name in data.keys() if name not in fixed]
        where_phrase = [f'{name}=%({name})s' for name in fixed]
        where = ' and '.join(where_phrase)
        update_sql = f'update {table} set {",".join(fileds)} where {where}'
        self.cur.execute(update_sql, data)
        self.conn.commit()
        print(f'{table} 更新一条数据到 {table} 成功')
if __name__ == '__main__':
    conn, cur = mysql_conn()
    mysql_client = MysqlORM(conn, cur)
    resp = requests.get('https://www.who.int/emergencies/disease-outbreak-news')
    list_page = Selector(text=resp.text)
    # enumerate replaces the manual i = i + 1 counter from the original.
    for i, dis_div in enumerate(list_page.xpath('//div[@class="sf-list-vertical"]/a'), start=1):
        print(i)
        # span[2] holds the publish date; the last 3 characters are stripped
        # (assumes a fixed-width suffix in the source markup — TODO confirm).
        p_time = dis_div.xpath('.//h4//span[2]//text()').get()[:-3]
        title = dis_div.xpath('.//h4//span[3]//text()').get()
        href = dis_div.xpath('.//@href').get()
        # Follow the link to the detail page and join every text fragment
        # inside the article's paragraphs into a single content string.
        resp_2 = requests.get(href)
        detail_page = Selector(text=resp_2.text)
        content = ''.join(
            fragment.get()
            for paragraph in detail_page.xpath('//article//p')
            for fragment in paragraph.xpath('.//text()')
        )
        print("%s的长度%d" % (p_time, len(p_time)))
        # Dates look like "4 November 2021"; strptime raises ValueError on
        # anything else, which aborts the run — same behavior as before.
        tup = time.strptime(p_time, "%d %B %Y")
        dis_data = {
            'title': title,
            'publish_time': time.asctime(tup),
            'collect_time': time.asctime(time.localtime()),
            'content': content,
            'url': href,
        }
        print(dis_data['collect_time'])
        mysql_client.insert_one('disease', dis_data)
引用的代码:
刘祥龙
17379710817