# Setup notes:
# 1. Create database: lover
# 2. Create table: qinghua
# Key point:
# Usage: truncate table <table_name>;  after clearing the table, the auto-increment id restarts from 1
# -*- coding: utf-8 -*-
# @Time:2021/8/10 0010 18:19
# @Author:权倾天下
# @File:情话网_面向对象.py
# @Software:PyCharm
# Scrape all the love-quote data from the qinghua site and save it to the database
import requests
from lxml import etree
import pymysql
class Spider_Qing_Hua():
def __init__(self):
self.url='http://www.ainicr.cn/tab/'
self.headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
'Referer': 'http://www.ainicr.cn/'
}
self.db=pymysql.connect(user='root',password='201314',database='lover',charset='utf8')
self.cursor=self.db.cursor()
# 获取初始页面的数据
def get_html_data(self,url):
try:
response=requests.get(url,self.headers)
response.encoding=response.apparent_encoding
if response.status_code==200:
return response.text
except Exception as error:
print(error)
# 获取详情列表当中的url
def get_html_link(self,data):
html=etree.HTML(data)
# html_names=html.xpath('//div[@class="tjjjbar"]//ul[@class="tj_two"]//a/text()')
html_links=html.xpath('//div[@class="tjjjbar"]//ul[@class="tj_two"]//a/@href')
# print(html_links)
return html_links
# 解析得到下一页的详情url
def parse_link(self,res):
html=etree.HTML(res)
link=html.xpath('//h4/a/@href')
return link
# 获取情话的文本数据
def get_text(self,res):
html=etree.HTML(res)
text=html.xpath('//div[@class="stbody "]/a/p/text()|//div[@class="stbody first"]/a/p/text()')
# print(text)
for concent in text:
print(concent)
self.save_data_mysql(concent)
print('**********'*10)
# 保存数据到数据库
def save_data_mysql(self,data):
sql = 'insert into qinghua(text) values(%s)' # sql语句
self.cursor.execute(sql, [data]) # 执行sql语句 .execute要求以列表或者元组形式你参数
self.db.commit() # 提交
# 主程序
def run(self):
res=self.get_html_data(self.url)
# print(res)
links=self.get_html_link(res)
# print(links)
for link in links:
data=self.get_html_data(link)
urls=self.parse_link(data)
for url in urls:
print(url)
text_data=self.get_html_data(url)
self.get_text(text_data)
if __name__ == '__main__':
qinghua=Spider_Qing_Hua()
qinghua.run()