如果你有其他编程基础的话,学习Python肯定不会太难。我的本命是Java,但是Java我也是个渣,几乎只会增删改查,做过最难的也是增删改查。不过基本功能模块以及上级交代的任务,我还是可以完成的。最近因为工作需要,我开始学习Python,贴一下自己的代码,我用的是Python 3.7。
import urllib.request

url = "http://www.baidu.com/"
# Open the URL. The HTTP response body is a one-shot stream: calling
# read() a second time (or readlines() after read()) returns nothing.
# The original code read the stream at the first print and then wrote an
# empty baidu1.html — so read once, keep the bytes, and reuse them.
response = urllib.request.urlopen(url)
body = response.read()  # raw bytes; body.decode() gives a str (utf-8 default)
print(body)
# URL that was actually fetched (after any redirects)
print(response.geturl())
# Headers come back as a list of (name, value) pairs; dict() makes them readable
print(dict(response.getheaders()))
# HTTP status code
print(response.getcode())
# Line-by-line view (bytes). Split the cached bytes instead of calling
# response.readlines(), which would be empty after read() above.
print(body.splitlines(True))
# Save as text: decode the bytes and open with 'w'
# with open('baidu.html', 'w', encoding='utf8') as fp:
#     fp.write(body.decode())
# Save the raw bytes: 'wb' opens in binary mode and creates the file
with open('baidu1.html', 'wb') as fp:
    fp.write(body)

# Image download demo
image_url = "https://ss0.bdstatic.com/94oJfD_bAAcT8t7mm9GUKT-xh_/timg?image&quality=100&size=b4000_4000&sec=1562431650&di=be95dba31e377e497b6c5914ada7bd33&src=http://upload4.95171.cn/pic/AESH10001446/4655.jpg"
response = urllib.request.urlopen(image_url)
print(response)
# Images are binary data, so the file must be opened in 'wb' mode
with open('mienv.jpg', 'wb') as pf:
    pf.write(response.read())
# Second way to save: urlretrieve(url, filename) does the fetch + write in one call
urllib.request.urlretrieve(image_url, 'chi.jpg')
刚开始感觉python好简单啊,而且兴趣贼强,写着写着发现好难,但是跟Java比起来代码确实要少很多,比如连接MySQL直接导入pymysql,用 pip install pymysql 命令导包,写几行代码就ok。但是越往后越感觉难,应该是我的代码量太少了,http协议什么的还有html标签class都需要了解,后期反爬更难,不过我肯定要学习的,java~python共存,不管多难一定要学习。我感觉我的学习方法还是不行,靠的是走量来完成的,写的多了才能记住,也是因为这点让我发现了我是个普通人啊,IQ没有太高,记忆力也不行,所以要更加努力,加油,学习多少记多少,兴趣很重要,完成一点东西自己就感觉好骄傲。IT行业路难走,得一直追寻着新的技术的学习、追寻,我也想学习,但是路很迷茫,无法规划自己的路怎么走,学习东西三天两头给落下,自学两三天就不想学了,就想学别的,技术诱惑太多,我无法辨别哪是我的路,我特别希望我的技术路上有一位可以带我走出迷雾的大佬,我是真的在迷雾里面。对我而言实际开发了才有冲劲,才有解决一切困难的status,学习之路很漫长,也很遥远,希望我的道路上能碰到一位能带我走出迷雾的老师。在学校里是学生,出来社会是职场人,我想我的职场之路有老师以及一起学习的同学。写点鸡汤安慰一下自己:哎,写不出来,算了。希望能碰见一起学习、互相帮忙、可以比拼的人。
贴一下最近写的Python爬虫,爬取百度贴吧会员列表:应该是可以直接用的。
"""
Python写的百度贴吧工具
"""
import pymysql
host = 'localhost'
db_name = 'test'
username = 'root'
password = 'Admin@123'
def _get_connection(host, username, password, db_name):
    """Open and return a pymysql connection to ``db_name`` using utf8mb4."""
    params = {
        'host': host,
        'user': username,
        'password': password,
        'charset': 'utf8mb4',
        'db': db_name,
    }
    return pymysql.connect(**params)
def _insert_table(connection, username):
insert_table_sql = """
INSERT INTO tieba_bing
VALUES(%s)"""
with connection.cursor() as cursor:
cursor.execute(insert_table_sql, (username))
connection.commit()
import urllib.request as request
from bs4 import BeautifulSoup
import re
import log_config
import logging
logger = logging.getLogger()  # root logger; log_config (imported above) attaches its handlers
# Tieba serves these member-list pages in GBK, not UTF-8.
encoding = 'GBK'
# word= is the percent-encoded (GBK) name of the target tieba.
base_url = 'http://tieba.baidu.com/bawu2/platform/listMemberInfo?word=%BB%AC%B1%F9'
# base_url = 'http://tieba.baidu.com/bawu2/platform/listMemberInfo?word=%B9%FD%C1%CB%BC%B4%CA%C7%BF%CD'
start_page = 1
total_pages = None  # filled in by _get_total_pages()
# Single shared connection, opened at import time.
connection = _get_connection(host, username, password, db_name)
def _get_total_pages():
    """Fetch the first member page and store the page count in ``total_pages``."""
    global total_pages
    raw = request.urlopen(base_url).read()
    document = BeautifulSoup(raw.decode(encoding), 'lxml')
    total_span = document.find('span', class_='tbui_total_page')
    # The span text looks like '共12页'; capture the digits.
    matched = re.match(r'共(\d+)页', total_span.string)
    total_pages = int(matched.group(1))
    logger.info(f'会员共{total_pages}页')
def _find_all_users():
    """Walk every member page and insert each user name into the database.

    Requires ``_get_total_pages()`` to have run first so ``total_pages`` is set.
    """
    for i in range(start_page, total_pages + 1):
        target_url = f'{base_url}&pn={i}'
        logger.info(f'正在分析第{i}页')
        html = request.urlopen(target_url).read().decode(encoding)
        soup = BeautifulSoup(html, 'lxml')
        outer_div = soup.find('div', class_='forum_info_section member_wrap clearfix bawu-info')
        inner_spans = outer_div.find_all('span', class_='member')
        # Start numbering at 1 so the log message reads naturally.
        for index, span in enumerate(inner_spans, 1):
            name_link = span.find('a', class_='user_name')
            name = name_link.string
            logger.info(f'已找到 {name}')
            try:
                _insert_table(connection, name)
            except Exception:
                # logger.exception records the traceback; the original message
                # had garbled word order ('{index}第个') and discarded the
                # exception object entirely.
                logger.exception(f'第{i}页第{index}个用户 {name} 发生异常')
import datetime  # NOTE(review): unused in this script — candidate for removal
if __name__ == '__main__':
    # Determine the page count first, then crawl every page.
    _get_total_pages()
    _find_all_users()
import logging

# log_config: configures the root logger on import.
# Console shows everything from DEBUG up; log.log records only ERROR and above.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# One shared message format for both destinations.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Console handler.
consoleHandler = logging.StreamHandler()
consoleHandler.setLevel(logging.DEBUG)
consoleHandler.setFormatter(formatter)
logger.addHandler(consoleHandler)

# File handler (appends to log.log).
fileHandler = logging.FileHandler('log.log', mode='a', encoding='UTF-8')
fileHandler.setLevel(logging.ERROR)
fileHandler.setFormatter(formatter)
logger.addHandler(fileHandler)
还有爬取百度贴吧头像的:这个是爬取"头像吧"会员的头像并保存到本地文件夹里面的。
"""
Python写的百度贴吧爬取用户头像工具
"""
import pymysql
host = 'localhost'
db_name = 'test'
username = 'root'
password = 'Admin@123'
def _get_connection(host, username, password, db_name):
    """Open a utf8mb4 pymysql connection to the given MySQL database."""
    return pymysql.connect(
        host=host, user=username, password=password,
        charset='utf8mb4', db=db_name)
# def _insert_table(connection, username):
# insert_table_sql = """
# INSERT INTO tieba_bing
# VALUES(%s)"""
# with connection.cursor() as cursor:
# cursor.execute(insert_table_sql, (username))
# connection.commit()
import urllib.request as request
from bs4 import BeautifulSoup
import re
import log_config
import logging
import requests
import os
logger = logging.getLogger()  # root logger; log_config (imported above) attaches its handlers
# Tieba serves these member-list pages in GBK, not UTF-8.
encoding = 'GBK'
# word= is the percent-encoded (GBK) name of the target tieba ('头像' here).
base_url = 'http://tieba.baidu.com/bawu2/platform/listMemberInfo?word=%CD%B7%CF%F1'
# base_url = 'http://tieba.baidu.com/bawu2/platform/listMemberInfo?word=%B9%FD%C1%CB%BC%B4%CA%C7%BF%CD'
start_page = 1
total_pages = None  # filled in by _get_total_pages()
# Opened at import time; unused here since _insert_table is commented out above.
connection = _get_connection(host, username, password, db_name)
# Destination folder for downloaded avatar images.
file_path='D:/book/img1'
def _get_total_pages():
    """Read the member-list front page and record the total page count."""
    global total_pages
    page_bytes = request.urlopen(base_url).read()
    soup = BeautifulSoup(page_bytes.decode(encoding), 'lxml')
    span = soup.find('span', class_='tbui_total_page')
    # The span's text has the form '共12页'; extract the number of pages.
    counter = re.compile(r'共(\d+)页')
    total_pages = int(counter.match(span.string).group(1))
    logger.info(f'会员共{total_pages}页')
def strip(path):
    """Remove characters that are illegal in Windows file names.

    Covers the full reserved set ``\\ / : * ? " < > |``. The original
    character class listed ``/`` twice and omitted ``|``.

    :param path: any value; it is converted with str() first
    :return: the cleaned string
    """
    return re.sub(r'[\\/:*?"<>|]', '', str(path))
def _find_all_users():
    """Walk every member page and download each member's avatar to ``file_path``.

    Requires ``_get_total_pages()`` to have run first so ``total_pages`` is set.
    """
    for i in range(start_page, total_pages + 1):
        target_url = f'{base_url}&pn={i}'
        logger.info(f'正在分析第{i}页')
        print(f'正在分析第{i}页')
        html = request.urlopen(target_url).read().decode(encoding)
        soup = BeautifulSoup(html, 'lxml')
        outer_div = soup.find('div', class_='forum_info_section member_wrap clearfix bawu-info')
        inner_spans = outer_div.find_all('span', class_='member')
        for index, span in enumerate(inner_spans):
            name_link = span.find('a', class_='user_name')
            name_img = span.find('img')
            print(name_img['src'])
            name = name_link.string
            url_img = name_img['src']
            try:
                # Make sure the destination folder exists.
                if not os.path.exists(file_path):
                    os.makedirs(file_path)
                # Avatar URLs carry no useful extension, so use .jpg.
                file_suffix = '.jpg'
                print(file_suffix)
                # strip() removes characters that are illegal in file names.
                # The original defined strip() but never called it, so a user
                # name containing e.g. '/' or '?' broke the file path.
                filename = '{}{}{}{}'.format(file_path, os.sep, strip(name), file_suffix)
                print(filename)
                # Download the image straight into the target file.
                request.urlretrieve(url_img, filename=filename)
            except IOError as e:
                # Keep the error details instead of printing a bare "IOError".
                print(f'IOError: {e}')
            except Exception as e:
                print(f'Exception: {e}')
            logger.info(f'已找到 {name}')
import datetime  # NOTE(review): unused in this script — candidate for removal
if __name__ == '__main__':
    # Determine the page count first, then crawl every page.
    _get_total_pages()
    _find_all_users()
import logging

# log_config: set up the root logger once, on import.
# Everything from DEBUG up goes to the console; only ERROR and above
# is appended to log.log.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Both handlers share one format string.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Terminal output.
consoleHandler = logging.StreamHandler()
consoleHandler.setLevel(logging.DEBUG)
consoleHandler.setFormatter(formatter)

# Persistent error log, opened in append mode.
fileHandler = logging.FileHandler('log.log', mode='a', encoding='UTF-8')
fileHandler.setLevel(logging.ERROR)
fileHandler.setFormatter(formatter)

for handler in (consoleHandler, fileHandler):
    logger.addHandler(handler)
希望可以帮助各位刚入门的朋友,我会定期把自己所学的发上来,希望大佬点评一下。