Helps us quickly generate a list containing a batch of data: [i + 10 for i in range(10)] -> [10, 11, 12, ..., 19]; ["10月{}日".format(i) for i in range(1, 10)]
Dict comprehensions
Help us quickly generate a dict containing a batch of data
{"a{}".format(i): 10 for i in range(3)}
# {"a0": 10, "a1": 10, "a2": 10}
Ternary operator
a = 10 if 3 < 4 else 20   # 3 < 4 is True, so a = 10
a = 10 if 4 < 3 else 20   # 4 < 3 is False, so a = 20
Python basics extension: the format() function
In the string, {NUM} refers to a positional argument (0 is the first, 1 is the second); a colon introduces the format spec. Examples:
age = 25
name = 'Caroline'
print('{0} is {1} years old. '.format(name, age)) # fill in positional arguments
print('{0} is a girl. '.format(name))
print('{0:.3} is a decimal. '.format(1/3)) # three significant digits (use {0:.3f} for three digits after the decimal point)
print('{0:_^11} is a 11 length. '.format(name)) # center in a field of width 11, padding with _
print('{first} is as {second}. '.format(first=name, second='Wendy')) # replace by keyword name
print('My name is {0.name}'.format(open('out.txt', 'w'))) # access an attribute of the argument (here the file object's name)
print('My name is {0:8}.'.format('Fred')) # specify a minimum field width
The strip() function in Python
Removes the specified characters from the head and tail of a string (head and tail only) and returns a new string.
s.strip("0") # strip zeros from both ends
s.strip()    # strip whitespace from both ends
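A quick concrete example (the variable s is just illustrative):
s = "00abc00"
print(s.strip("0"))   # abc
s = "  hello  "
print(s.strip())      # hello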
Commas in print()
Multiple arguments separated by commas are printed on the same line, separated by spaces.
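For example (the values here are just illustrative):
name = 'Caroline'
age = 25
print(name, 'is', age)   # Caroline is 25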
Common Python error
Inside a string delimited by double quotes you cannot use another unescaped double quote; escape it with \" or delimit the string with single quotes.
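A minimal sketch:
s1 = "He said \"hello\""   # escape the inner double quotes
s2 = 'He said "hello"'     # or use single quotes as the outer delimiter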
Loading the scraped data into a database (using the pymysql module)
# -*- coding: utf-8 -*-  # force utf-8 source encoding
import requests
import json
import pymysql
from lxml import etree


class QiuBaiSpder:
    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36"}

    def get_url_list(self):
        # build the list of page urls to crawl
        url_list = [self.url_temp.format(i) for i in range(1, 14)]
        return url_list

    def parse_url(self, url):
        # request a page and return the decoded html
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):
        # extract author, content and stats from every post on the page
        html = etree.HTML(html_str)
        div_list = html.xpath("//div[@id='content-left']/div")
        content_list = []
        for div in div_list:
            item = {}
            item["author_name"] = div.xpath(".//h2/text()")[0].strip() if len(div.xpath(".//h2/text()")) > 0 else None
            item["content"] = div.xpath(".//div[@class='content']/span[1]/text()")
            item["content"] = [i.strip() for i in item["content"]]
            item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")
            item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"]) > 0 else None
            print(item["stats_vote"])         # debug output
            print(type(item["stats_vote"]))
            item["stats_comments"] = div.xpath(".//span[@class='stats-comments']//i/text()")
            item["stats_comments"] = item["stats_comments"][0] if len(item["stats_comments"]) > 0 else None
            print(item["stats_comments"])     # debug output
            print(type(item["stats_comments"]))
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):
        # save the items as one json object per line
        with open("qiubai.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))  # write utf-8 text instead of \u escapes
                f.write("\n")
        print("save")

    def save_mysql(self, content_list):
        connect = pymysql.connect(host="192.168.43.122",
                                  user="wei",
                                  password="123456",
                                  db="gaodb",
                                  charset="utf8mb4",  # posts may contain emoji, so utf8mb4 is required
                                  use_unicode=True)   # return results as unicode
        cursor = connect.cursor()
        # The next three statements are important; without them the inserts fail with encoding errors
        cursor.execute("SET NAMES utf8mb4")
        cursor.execute("SET CHARACTER SET utf8mb4")
        cursor.execute("SET character_set_connection=utf8mb4")
        for content in content_list:
            i = (content["author_name"], content["content"][0], content["stats_vote"], content["stats_comments"])
            print(i)
            sql = "insert into demo1 (username, content, vote, comments) values (%s, %s, %s, %s)"
            cursor.execute(sql, i)
            connect.commit()
        cursor.close()
        connect.close()

    def run(self):
        url_list = self.get_url_list()
        for url in url_list:
            html_str = self.parse_url(url)
            content_list = self.get_content_list(html_str)
            # self.save_content_list(content_list)
            self.save_mysql(content_list)


if __name__ == '__main__':
    qiubai = QiuBaiSpder()
    qiubai.run()
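The demo1 table has to exist before save_mysql can insert into it. Its schema is not shown in these notes; below is a minimal sketch of what it might look like, created through pymysql with the same connection settings (the column names follow the insert statement above, but the types and lengths are assumptions):

import pymysql

connect = pymysql.connect(host="192.168.43.122", user="wei", password="123456",
                          db="gaodb", charset="utf8mb4")
cursor = connect.cursor()
# hypothetical schema matching the insert statement above; adjust types/lengths as needed
cursor.execute("""
    CREATE TABLE IF NOT EXISTS demo1 (
        id INT AUTO_INCREMENT PRIMARY KEY,
        username VARCHAR(255),
        content TEXT,
        vote VARCHAR(32),
        comments VARCHAR(32)
    ) DEFAULT CHARSET=utf8mb4
""")
connect.commit()
cursor.close()
connect.close()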