1. Installing pymysql (for reference only; we will not use it later)
> pip install pymysql
Docs: https://pymysql.readthedocs.io/en/latest/user/examples.html
# +----------------------------------------------------------------------
# | User: zq
# | Version: python3.7
# | Time: 2020-03-09 15:40
# +----------------------------------------------------------------------
import pymysql.cursors

# Connect to the database
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='root',
                             db='spider',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)

try:
    with connection.cursor() as cursor:
        # Create a new record
        sql = "INSERT INTO `users` (`email`, `password`) VALUES (%s, %s)"
        cursor.execute(sql, ('webmaster@python.org', 'very-secret'))

    # connection is not autocommit by default. So you must commit to save
    # your changes.
    connection.commit()

    with connection.cursor() as cursor:
        # Read a single record
        sql = "SELECT `id`, `password` FROM `users` WHERE `email`=%s"
        cursor.execute(sql, ('webmaster@python.org',))
        result = cursor.fetchone()
        print(result)
finally:
    connection.close()
2. peewee can replace pymysql; we use this ORM as the data-access layer
https://github.com/coleifer/peewee
http://docs.peewee-orm.com/en/latest/peewee/quickstart.html#storing-data
Install: > pip install peewee
# +----------------------------------------------------------------------
# | User: zq
# | Version: python3.7
# | Time: 2020-03-09 16:30
# +----------------------------------------------------------------------
from peewee import *

# Note: port is a number, not a string — do not quote it
db = MySQLDatabase("spider", host="localhost", port=3306, user="root", password="root")

class Person(Model):
    name = CharField()
    birthday = DateField()

    class Meta:
        database = db  # this model uses the "spider" database
        # table_name = "users"  # optional; if omitted, the table is named "person"

if __name__ == "__main__":
    db.create_tables([Person])  # create the table
- port is a number; do not wrap it in quotes
- db.create_tables([Person]): if you have several model classes, pass them all in (see the sketch below)
- table_name in Meta customizes the table name; if omitted, the lowercased class name is used
- no primary key was declared above, so peewee automatically adds an id primary key for us
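A minimal sketch of the last three points, adding a hypothetical Pet model alongside Person:
class Pet(Model):  # no primary key declared, so peewee adds an auto-increment id
    name = CharField()

    class Meta:
        database = db
        table_name = "pets"  # custom name; without this the table would be called "pet"

if __name__ == "__main__":
    db.create_tables([Person, Pet])  # pass every model class in one call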
class Person(Model):
    ids = CharField(max_length=20, primary_key=True)
    name = CharField(max_length=20, null=True)
    answer_nums = IntegerField(default=0)  # number of replies
    birthday = DateField()
- field types can take extra options:
- max_length=20 limits the length to 20
- primary_key=True marks the field as the primary key; if omitted, the default id key is added
- null=True allows NULL; without it every field is NOT NULL by default
- careful: the moment you define your own primary key (as with ids above), the default id key no longer applies and you must specify the key yourself
- default=0 sets a default value
1 Inserting data
if __name__ == "__main__":
    db.create_tables([Person])

    # 1. insert data
    from datetime import date
    uncle_bob = Person(name='Bob', birthday=date(1960, 1, 15))
    uncle_bob.save()  # bob is now stored in the database
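A scrape usually yields many rows at once; peewee's insert_many can write them in a single statement. A sketch using the Person model above (the rows are illustrative):
from datetime import date

rows = [
    {'name': 'Alice', 'birthday': date(1979, 5, 1)},
    {'name': 'Carol', 'birthday': date(1985, 8, 20)},
]
Person.insert_many(rows).execute()  # one INSERT covering all rows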
2 Fetching a single record with the get() method. Note: get() raises an exception when no row matches, so wrap it in a try statement.
# 2. query data: fetch one record vs. fetch many
# fetch one record
# grandma = Person.select().where(Person.name == 'Bob').get()
# shorthand:
try:
    grandma = Person.get(Person.name == 'Bob')
    print(grandma.birthday)
except Exception as e:
    print('Error:', e)
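If you would rather not wrap every lookup in try/except, peewee also provides get_or_none(), which returns None instead of raising:
grandma = Person.get_or_none(Person.name == 'Bob')
if grandma is not None:
    print(grandma.birthday)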
3 Fetching multiple records; an empty result does not raise an exception.
# 3. query data: fetch many
query = Person.select().where(Person.name == 'Bob')
for person in query:
    print(person.name, person.birthday)
4 Updating data. save() inserts the row when it does not exist yet and updates it when it does.
If you assigned the primary key yourself, you must call save(force_insert=True) when inserting a new row; otherwise the program assumes an update rather than an insert.
# 4. update data
from datetime import date
query = Person.select().where(Person.name == 'Bob')
for person in query:
    person.birthday = date(2021, 10, 12)
    person.save()
save(force_insert=True) forces an INSERT; without the flag, save() performs an UPDATE.
try:
    existed_topics = Topic.select().where(Topic.id == topic.id)
    if existed_topics:
        topic.save()
    else:
        topic.save(force_insert=True)
except Exception as e:
    pass
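A related helper is peewee's get_or_create(), which fetches the row or inserts it in one call. Note that it never updates an existing row, so it replaces only the insert half of the pattern above (the id and defaults below are illustrative):
topic, created = Topic.get_or_create(
    id=12345,                                  # illustrative topic id
    defaults={'title': 'demo', 'status': ''},  # used only when the row is inserted
)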
5 Deleting data with the .delete_instance() method
# 5. delete data
query = Person.select().where(Person.name == 'Bob')
for person in query:
    person.delete_instance()  # delete this row
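To delete every matching row with a single DELETE statement instead of looping, peewee also supports a model-level delete():
Person.delete().where(Person.name == 'Bob').execute()  # returns the number of rows removed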
7-5 models: designing the table schema
Key points when designing the tables:
1. CharField needs a max length set (the default is 255)
2. use TextField for fields whose maximum length cannot be determined
3. normalize scraped data as much as possible before storing it
4. think about when to use default values versus null=True
# +----------------------------------------------------------------------
# | User: zq
# | Version: python3.7
# | Time: 2020-03-09 20:48
# +----------------------------------------------------------------------
from peewee import *

db = MySQLDatabase("spider", host="localhost", port=3306, user="root", password="root")

class BaseModel(Model):
    class Meta:
        database = db

# Key points when designing the tables:
# 1. CharField needs a max length set (default 255)
# 2. use TextField when the maximum length cannot be determined
# 3. normalize scraped data before storing it
# 4. think about default values vs. null=True

# topic list
class Topic(BaseModel):
    title = CharField()                           # title
    content = TextField(default='')               # body
    id = IntegerField(primary_key=True)           # topic id
    author = CharField(default='')                # author
    create_time = DateTimeField()                 # creation time
    answer_nums = IntegerField(default=0)         # number of replies
    click_nums = IntegerField(default=0)          # number of clicks
    praised_nums = IntegerField(default=0)        # number of likes
    jtl = FloatField(default=0.0)                 # close rate
    score = IntegerField(default=0)               # bounty points
    status = CharField()                          # status
    last_answer_time = DateTimeField(null=True)   # last reply time

# individual replies
class Answer(BaseModel):
    topic_id = IntegerField()                # topic id
    author = CharField()                     # responder name
    content = TextField()                    # reply body
    create_time = DateTimeField()            # reply time
    praised_nums = IntegerField(default=0)   # number of likes

# users
class Author(BaseModel):
    name = CharField()
    id = CharField(primary_key=True)
    click_nums = IntegerField(default=0)      # profile visits
    original_nums = IntegerField(default=0)   # original posts
    forward_nums = IntegerField(default=0)    # reposts
    rate = IntegerField(default=1)            # rank
    answer_nums = IntegerField(default=0)     # comments
    praised_nums = IntegerField(default=0)    # likes received
    desc = TextField(null=True)               # bio
    industry = CharField(null=True)           # industry
    location = CharField(null=True)           # location
    follower_nums = IntegerField(default=0)   # followers
    following_nums = IntegerField(default=0)  # following

if __name__ == "__main__":
    db.create_tables([Topic, Answer, Author])
7-6 Analyzing and collecting all the forum sections
parse.urljoin handles URL joining for us:
from urllib import parse
domain = "http://www.csdn.com"
url = parse.urljoin(domain, '/forums/ios')
Output: http://www.csdn.com/forums/ios
If the second argument is already an absolute URL:
url = parse.urljoin(domain, 'http://www.baidu.com/forums/ios')
Output: http://www.baidu.com/forums/ios
The module handles this automatically: when the second argument is an absolute URL, the domain we set is ignored. This is very handy.
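Both cases as one runnable snippet:
from urllib import parse

domain = "http://www.csdn.com"
print(parse.urljoin(domain, '/forums/ios'))                      # http://www.csdn.com/forums/ios
print(parse.urljoin(domain, 'http://www.baidu.com/forums/ios'))  # http://www.baidu.com/forums/ios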
The code so far parses the page, extracts the paths we need to crawl, and collects them into a list to keep.
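One trick worth calling out in get_nodes_json below: the JS array is grabbed with a regex, JS null is swapped for Python None, and ast.literal_eval turns the string into a Python list. A toy demo with made-up left_menu_text:
import re
import ast

left_menu_text = "forumNodes: [{'url': '/forums/ios', 'children': null}]"
m = re.search("forumNodes: (.*])", left_menu_text)
nodes = ast.literal_eval(m.group(1).replace('null', 'None'))
print(nodes)  # [{'url': '/forums/ios', 'children': None}]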
# +----------------------------------------------------------------------
# | User: zq
# | Version: python3.7
# | Time: 2020-03-09 21:20
# +----------------------------------------------------------------------
"""
Plan:
1. fetch
2. parse
3. store
"""
import re
import ast  # to turn the str into a list
from urllib import parse

import requests

from csdn_spider.models import *

# the csdn domain to crawl
domain = 'https://bbs.csdn.net/'

# fetch the js, then pull the list data out with a regex
def get_nodes_json():
    left_menu_text = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn").text
    nodes_str_match = re.search("forumNodes: (.*])", left_menu_text)
    if nodes_str_match:
        nodes_str = nodes_str_match.group(1).replace('null', 'None')  # map js null to Python None
        nodes_list = ast.literal_eval(nodes_str)  # turn the str into a list
        return nodes_list
    return []

# list of urls
url_list = []

# walk the nodes and collect every url into url_list
def process_nodes_list(nodes_list):
    # pull the urls out of the json structure into the list
    for item in nodes_list:
        if "url" in item:
            if item['url']:
                url_list.append(item['url'])
            if 'children' in item:
                process_nodes_list(item['children'])

# collect the top-level urls; we drop them later because they only aggregate the urls below them
def get_level1_list(nodes_list):
    level1_url = []
    for item in nodes_list:
        if 'url' in item and item['url']:
            level1_url.append(item['url'])
    return level1_url

# build the final list of urls to crawl
def get_last_urls():
    # fetch the list data
    nodes_list = get_nodes_json()
    # extract the urls from it
    process_nodes_list(nodes_list)
    # get the level-1 urls
    level1_url = get_level1_list(nodes_list)
    # the urls that survive the filtering
    last_urls = []
    # keep a url if it is in url_list but not a level-1 url
    for url in url_list:
        if url not in level1_url:
            last_urls.append(url)
    all_urls = []
    # csdn's default path lists unsolved topics; /recommend is featured and /closed is
    # solved — we want all three, so build each variant
    for url in last_urls:
        all_urls.append(parse.urljoin(domain, url))
        all_urls.append(parse.urljoin(domain, url + '/recommend'))
        all_urls.append(parse.urljoin(domain, url + '/closed'))
    return all_urls

if __name__ == "__main__":
    last_urls = get_last_urls()
    print(last_urls)
    print(len(last_urls))
Final code:
Approach: fetch the links first, then keep crawling by following the next-page link.
For each fetched link, parse part of the data into the table — checking whether the row already exists before saving — then follow through to the content page, parse the remaining fields, and save those too.
Core idea: design sensible functions to fetch the links, then parse and save into the tables.
# +----------------------------------------------------------------------
# | User: zq
# | Version: python3.7
# | Time: 2020-03-09 21:20
# +----------------------------------------------------------------------
"""
Plan:
1. fetch
2. parse
3. store
"""
import re
import ast  # to turn the str into a list
from urllib import parse
from datetime import datetime  # to turn strings into datetimes

import requests
from scrapy import Selector  # html selector

from csdn_spider.models import *

# the csdn domain to crawl
domain = 'https://bbs.csdn.net/'

# fetch the js, then pull the list data out with a regex
def get_nodes_json():
    left_menu_text = requests.get("https://bbs.csdn.net/dynamic_js/left_menu.js?csdn").text
    nodes_str_match = re.search("forumNodes: (.*])", left_menu_text)
    if nodes_str_match:
        nodes_str = nodes_str_match.group(1).replace('null', 'None')  # map js null to Python None
        nodes_list = ast.literal_eval(nodes_str)  # turn the str into a list
        return nodes_list
    return []

# list of urls
url_list = []

# walk the nodes and collect every url into url_list
def process_nodes_list(nodes_list):
    # pull the urls out of the json structure into the list
    for item in nodes_list:
        if "url" in item:
            if item['url']:
                url_list.append(item['url'])
            if 'children' in item:
                process_nodes_list(item['children'])

# collect the top-level urls; we drop them later because they only aggregate the urls below them
def get_level1_list(nodes_list):
    level1_url = []
    for item in nodes_list:
        if 'url' in item and item['url']:
            level1_url.append(item['url'])
    return level1_url

# build the final list of urls to crawl
def get_last_urls():
    # fetch the list data
    nodes_list = get_nodes_json()
    # extract the urls from it
    process_nodes_list(nodes_list)
    # get the level-1 urls
    level1_url = get_level1_list(nodes_list)
    # the urls that survive the filtering
    last_urls = []
    # keep a url if it is in url_list but not a level-1 url
    for url in url_list:
        if url not in level1_url:
            last_urls.append(url)
    all_urls = []
    # csdn's default path lists unsolved topics; /recommend is featured and /closed is
    # solved — we want all three, so build each variant
    for url in last_urls:
        all_urls.append(parse.urljoin(domain, url))
        all_urls.append(parse.urljoin(domain, url + '/recommend'))
        all_urls.append(parse.urljoin(domain, url + '/closed'))
    return all_urls

# parse a topic detail page
def parse_topic(url):
    # fetch the topic details and its replies
    topic_id = url.split('/')[-1]
    res_text = requests.get(url).text
    sel = Selector(text=res_text)
    all_divs = sel.xpath("//div[starts-with(@id, 'post-')]")  # xpath starts-with(): ids beginning with "post-"
    topic_item = all_divs[0]
    content = topic_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]  # body
    praised_nums = topic_item.xpath(".//label[@class='red_praise digg']/em/text()").extract()[0]  # likes
    jtl = 0  # close rate
    if topic_item.xpath(".//div[@class='close_topic']/text()").extract():
        jtl_str = topic_item.xpath(".//div[@class='close_topic']/text()").extract()[0]
        jtl_match = re.search(r"(\d+\.?\d*)%", jtl_str)
        if jtl_match:
            jtl = float(jtl_match.group(1))
    existed_topics = Topic.select().where(Topic.id == topic_id)
    # update the topic with the fields the list page did not have
    if existed_topics:
        topic = existed_topics[0]
        topic.content = content
        topic.jtl = jtl
        topic.praised_nums = praised_nums
        topic.save()

    for answer_item in all_divs[1:]:
        answer = Answer()
        answer.topic_id = topic_id  # the topic this reply belongs to
        author_info = answer_item.xpath(".//div[@class='nick_name']//a[1]/@href").extract()[0]
        answer.author = author_info.split('/')[-1]  # responder id
        create_time = answer_item.xpath(".//label[@class='date_time']/text()").extract()[0]
        create_time = datetime.strptime(create_time, '%Y-%m-%d %H:%M:%S')
        answer.create_time = create_time  # reply time
        content = answer_item.xpath(".//div[@class='post_body post_body_min_h']").extract()[0]  # body
        answer.content = content
        praised_nums = answer_item.xpath(".//label[@class='red_praise digg']/em/text()").extract()[0]  # likes
        answer.praised_nums = int(praised_nums)
        answer.save()

    # handle the next page
    next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
    # if there is a next page, build its absolute url and keep parsing
    if next_page:
        next_url = parse.urljoin(domain, next_page[0])
        parse_topic(next_url)

# parse a user profile page (left as a stub for now)
def parse_author(url):
    # headers = {
    #     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    # }
    # res_text = requests.get(url, headers=headers).text
    # sel = Selector(text=res_text)
    pass

# parse a list page
def parse_list(url):
    res_text = requests.get(url).text  # fetch the page
    sel = Selector(text=res_text)  # build the selector
    # on inspection, the data we need lives in the tr rows under this table
    all_trs = sel.xpath("//table[@class='forums_tab_table']/tbody//tr")
    # all_trs = sel.xpath("//table[@class='forums_tab_table']//tr")[2:]  # this also works
    for tr in all_trs:
        topic = Topic()
        if tr.xpath(".//td[1]/span/text()").extract():
            status = tr.xpath(".//td[1]/span/text()").extract()[0]  # status: "未结" (open), "已结" (closed), "满意" (satisfied)
            topic.status = status
        if tr.xpath(".//td[2]/em/text()").extract():
            score = tr.xpath(".//td[2]/em/text()").extract()[0]  # bounty points
            topic.score = int(score)
        if tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/@href").extract():
            topic_url = tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/@href").extract()[0]  # title link, relative
            topic.id = int(topic_url.split('/')[-1])  # topic id
            topic_url = parse.urljoin(domain, topic_url)  # make it absolute for the later crawl
        if tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/text()").extract():
            topic_title = tr.xpath(".//td[3]/a[contains(@class, 'forums_title')]/text()").extract()[0]  # title text
            topic.title = topic_title
        if tr.xpath(".//td[4]/a/text()").extract():
            author_url = tr.xpath(".//td[4]/a/@href").extract()[0]  # author link, relative
            author_id = author_url.split('/')[-1]  # author id
            author_url = parse.urljoin(domain, author_url)  # make it absolute
            topic.author = author_id
            # parse the user profile page
            parse_author(author_url)
        if tr.xpath(".//td[4]/em/text()").extract():
            create_time = tr.xpath(".//td[4]/em/text()").extract()[0]  # creation time as a string
            create_time = datetime.strptime(create_time, '%Y-%m-%d %H:%M')  # convert to datetime
            topic.create_time = create_time
        if tr.xpath(".//td[5]/span/text()").extract():
            answer_info = tr.xpath(".//td[5]/span/text()").extract()[0]  # "replies/views"
            answer_nums = answer_info.split('/')[0]  # replies
            click_nums = answer_info.split('/')[1]  # views
            topic.click_nums = int(click_nums)
            topic.answer_nums = int(answer_nums)
        if tr.xpath(".//td[6]/em/text()").extract():
            last_time_str = tr.xpath(".//td[6]/em/text()").extract()[0]  # last reply time
            last_time = datetime.strptime(last_time_str, '%Y-%m-%d %H:%M')  # convert to datetime
            topic.last_answer_time = last_time
        try:
            existed_topics = Topic.select().where(Topic.id == topic.id)
            if existed_topics:
                topic.save()
            else:
                topic.save(force_insert=True)
        except Exception as e:
            pass
        # parse the topic detail page
        # parse_topic(topic_url)

    # handle the next page
    next_page = sel.xpath("//a[@class='pageliststy next_page']/@href").extract()
    # if there is a next page, build its absolute url and keep parsing
    if next_page:
        next_url = parse.urljoin(domain, next_page[0])
        parse_list(next_url)

if __name__ == "__main__":
    last_urls = get_last_urls()
    for url in last_urls:
        parse_list(url)