# Language detection
import langid
info = "摺疊式自行車後視鏡同"
lineTuple = langid.classify(info) # run langid language detection on this line of text
# if lineTuple[0] == "zh": # if the line is mostly Chinese, leave it unprocessed
# Prints True when element 0 of the classify() result equals "zh"
print(lineTuple[0] == "zh")
def processing_data(content_list):
    """Write ``content_list`` into an xls workbook saved as '汽车2.xls'.

    Each item of ``content_list`` becomes one worksheet row and each
    element of that item becomes one cell in that row. The sheet is
    named 'My Worksheet' and the workbook is created with utf-8 encoding.
    """
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('My Worksheet')

    # write(row, column, label=value)
    for row_no, row in enumerate(content_list):
        for col_no, cell in enumerate(row):
            sheet.write(row_no, col_no, label=cell)

    book.save('汽车2.xls')
# 美团接口
http://api.meituan.com/group/v4/deal/select/city/30/cate/1?sort=solds&hasGroup=true&mpt_cate1=1&offset=2&limit=100
http://meishi.meituan.com/i/api/comment/deal
# 存入excel 时间命名
def processing_data(content_list):
    """Write ``content_list`` into an xls workbook named after the current time.

    Each item of ``content_list`` becomes one worksheet row and each
    element of that item becomes one cell. The workbook is saved as
    ``info/<timestamp>.xls`` where the timestamp is the local time
    formatted with ``%Y--%m--%d-%H-%M-%S``; the ``info`` directory is
    created when missing.

    ``xlwt`` must already be importable in the enclosing module.
    """
    # Local imports: the file has no top-level ``import os`` / ``import time``.
    import os
    import time

    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('My Worksheet')

    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs("info", exist_ok=True)

    # write(row, column, label=value)
    for row_no, row in enumerate(content_list):
        for col_no, cell in enumerate(row):
            worksheet.write(row_no, col_no, label=cell)

    # time.localtime() with no argument uses the current time.
    stamp = time.strftime("%Y--%m--%d-%H-%M-%S", time.localtime())
    workbook.save('info/%s.xls' % stamp)
# Create the folder (including missing parent directories) if it is not there yet.
# exist_ok=True replaces the separate existence check, which could race with
# another process creating the same path between the check and the call.
os.makedirs(path, exist_ok=True)
# 新浪微博
https://git.oschina.net/AJay13/ECommerceCrawlers/tree/master/WeiboCrawler
# 超时判断
@retry(stop_max_attempt_number=10, wait_fixed=2000)
def get_all_modules(url):
    """Request the Pluralsight course-content endpoint for one course.

    ``url`` is the course slug appended to the endpoint, for example
    ``maya-2019-fundamentals-dynamics-lighting-rendering``. The request is
    sent with a browser User-Agent, the module-level ``cookie`` value and a
    3-second timeout. This snippet does not return the response.
    """
    endpoint = "https://app.pluralsight.com/learner/content/courses/"
    course_url = endpoint + url

    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        "cookie": cookie,
    }

    print("获取所有信息,请稍等。。。")
    # A request that exceeds the timeout raises, which is what the retry decorator reacts to.
    requests.get(course_url, headers=request_headers, timeout=3)
# 微博地区爬虫 上海
https://m.weibo.cn/api/container/getIndex?containerid=23065700428008631000000000000&page=2
# 微博高级搜索接口
https://weibo.cn/search/mblog?advanced=mblog&f=s
# 机器学习网站 Tf
http://www.tensorfly.cn/tfdoc/tutorials/mnist_beginners.html
# 全国位置信息
https://github.com/lzxue/WeiboDataShare
https://blog.csdn.net/chinagissoft/article/details/50864485 # csdn
# 微博 API
https://open.weibo.com/wiki/%E5%BE%AE%E5%8D%9AAPI
# 微博
https://place.weibo.com/map/?maploc=-1.006507,-1.006012,12z&uid=5337188542&luicode=20000174
# Extract the first column straight from a CSV file
# The context manager closes the file even if a line fails to process.
with open('people.csv') as csv_file:
    # Skip the header row; the None default keeps an empty file from raising.
    next(csv_file, None)
    for line in csv_file:
        fields = line.strip().split(',')
        # print just field 0 (name)
        print(fields[0])
# 大学详细数据
# from openpyxl import Workbook
# def processing_data(content_list):
# wb = Workbook() # 创建文件对象
# # grab the active worksheet
# ws = wb.active # 获取第一个sheet
# for hh in content_list:
# ws.append(hh) # 写入多个单元格
# wb.save("weibo.xlsx")
# 正则判断邮箱
A、
r'^[a-z0-9][\w\.\-]*@[a-z0-9\-]+(\.[a-z]{2,5}){1,2}$'
# 微博粉丝接口
https://m.weibo.cn/profile/7294287819
# 微博定位
https://place.weibo.com/wandermap/search2?keyword=%E6%89%B6%E8%A5%BF%E6%9D%91
# 微博 个人信息接口
https://m.weibo.cn/api/container/getIndex?uid=5966875630&luicode=10000011&lfid=231522type%3D1%26t%3D10%26q%3D%23%E5%9E%83%E5%9C%BE%E5%88%86%E7%B1%BB%23&containerid=2302835966875630
# 微博 乌鲁木齐
https://weibo.com/p/1001018008665010000000000/checkin#place
# 镜像源
# pip install pyinstaller -i https://pypi.tuna.tsinghua.edu.cn/simple  (在安装命令后面加上 -i 镜像源地址,可以加快下载速度)
# 微信密钥
7a96949d32b15e6087e8ce337bb016a8
user_agent = [
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
"Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
"Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
"Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
"UCWEB7.0.2.37/28/999",
"NOKIA5700/ UCWEB7.0.2.37/28/999",
"Openwave/ UCWEB7.0.2.37/28/999",
"Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
# iPhone 6:
"Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",
]
headers = {'User-Agent': random.choice(user_agent)}
# 随机获取一个请求头
def get_user_agent():
    """Return one randomly chosen User-Agent string from the ``user_agent`` list."""
    # Local import: the file has no top-level ``import random``.
    import random

    # The pool defined in this file is named ``user_agent``; the previously
    # referenced ``USER_AGENTS`` is not defined anywhere in it.
    return random.choice(user_agent)
https://blog.csdn.net/WeiLanooo/article/details/101114434 # pycharm激活
https://blog.csdn.net/WeiLanooo/article/list/2 # 高手文章
https://blog.csdn.net/WeiLanooo/article/details/100812075 # 饿了么
ALTER user 'root'@'localhost' IDENTIFIED BY 'python'; # mysql修改密码
nohup python -u test.py > nohup.out 2>&1 & # 阿里云 python代码后台运行
https://weibo.com/5044281310/IhRw7a7Rg?filter=hot&root_comment_id=0&type=comment
from lxml import etree
# Parse the HTML text into an element tree (resp_text is defined elsewhere)
html_get = etree.HTML(resp_text)
# Take the first element matching the XPath; [0] raises IndexError when nothing matches
div_ok = html_get.xpath('//div[@id="mw-content-text"]')[0]
print(div_ok,type(div_ok))
div_content = etree.tostring(div_ok, pretty_print=True, method='html').decode('utf-8') # serialize the element to an HTML string
requests post请求!!
# POST example: the payload is serialized to a JSON string before sending.
# requests and json are used here but not imported in this snippet.
url = "https://ciac.zjw.sh.gov.cn/JGBXMHtbaWsbsWeb/Czhtba/GetCzxzjHtList"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36",
# NOTE(review): the header declares a form-urlencoded body while the body sent below is JSON text — confirm this is what the server expects
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"Cookie": "AlteonP=AIn0baXdHKx+VmJP8KbFWg$$",
"Host": "ciac.zjw.sh.gov.cn"
}
# All filter fields under "Data" are left empty; PageSize/PageIndex are sent as strings
data = {"PageSize":"10","PageIndex":"1","Data":{"QyJgdmz":"","Bjbh":"","Htmc":"","Wsslbh":"","HtWqRq_kssj":"","HtWqRq_jssj":"","HtWqzt":"","BjxxXmmc":""}}
ret = requests.post(url, headers=headers, data=json.dumps(data))
# 请求失败时,记得检查请求头里的 Content-Type,并尝试把 data 用 json.dumps() 转成 JSON 字符串再发送!
# 有些时候返回的数据需要多次 json.loads() 才能得到最终的对象!
# Write the collected pages as JSON. ensure_ascii defaults to True, which
# escapes non-ASCII characters as \uXXXX; passing False writes them as-is.
with open(jsonfile, 'w', encoding='utf-8') as jfile:
    json.dump(list_all_pages, jfile, ensure_ascii=False, indent=4)
时间戳转换
from datetime import datetime
s = '2020-10-09 09:35:0'
f = '%Y-%m-%d %H:%M:%S'
# Convert a time string to a Unix timestamp; the string s must match the format f
# Wrap the result in int(t) to get an integer
# The parsed datetime is naive, so timestamp() interprets it in the local timezone
t = datetime.strptime(s, f).timestamp() # float seconds since the epoch, e.g. 1602236100.0 when the local zone is UTC
print(t, "\n")
import cv2
import numpy as np
import base64
# numpy array -> base64 string
def numpy_to_base64(image_np):
    """Encode an image array as JPEG and return it as a base64 string.

    Args:
        image_np: Image array passed to ``cv2.imencode``.
            NOTE(review): presumably a uint8 image array — confirm with callers.

    Returns:
        The JPEG bytes, base64-encoded and decoded to ``str`` as UTF-8.
    """
    # cv2.imencode returns a (success_flag, buffer) pair; only the buffer is used.
    encoded = cv2.imencode('.jpg', image_np)[1]
    return base64.b64encode(encoded.tobytes()).decode('utf8')
# numpy array -> bytes
def numpy_to_bytes(image_np):
    """Encode an image array as JPEG and return the raw encoded bytes.

    Args:
        image_np: Image array passed to ``cv2.imencode``.

    Returns:
        The JPEG-encoded image as ``bytes``.
    """
    # cv2.imencode returns a (success_flag, buffer) pair; only the buffer is used.
    encoded = cv2.imencode('.jpg', image_np)[1]
    return encoded.tobytes()
# numpy array -> image file
def numpy_to_file(image_np, filename='你的文件名_numpy.jpg'):
    """Write an image array to disk with ``cv2.imwrite`` and return the path.

    Args:
        image_np: Image array passed to ``cv2.imwrite``.
        filename: Destination path. Defaults to the placeholder name that
            was previously hard-coded, so existing calls behave the same.

    Returns:
        ``filename`` as given.
    """
    # NOTE(review): the result of cv2.imwrite is ignored, as in the original,
    # so a failed write is not reported here.
    cv2.imwrite(filename, image_np)
    return filename
# bytes -> numpy array
def bytes_to_numpy(image_bytes):
    """Decode encoded image bytes into an image array.

    Args:
        image_bytes: Encoded image data (for example the content of a .jpg file).

    Returns:
        The result of ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``.
        NOTE(review): OpenCV documents an empty result (``None`` in Python)
        for data it cannot decode — callers may need to check for that.
    """
    raw = np.frombuffer(image_bytes, dtype=np.uint8)
    return cv2.imdecode(raw, cv2.IMREAD_COLOR)
# bytes -> base64 string
def bytes_to_base64(image_bytes):
    """Return ``image_bytes`` base64-encoded as a ``str``.

    Args:
        image_bytes: Bytes-like data to encode.

    Returns:
        The base64 text, decoded from bytes as UTF-8.
    """
    return base64.b64encode(image_bytes).decode('utf8')
# bytes -> file
def bytes_to_file(image_bytes, filename='你的文件名_bytes.jpg'):
    """Write ``image_bytes`` to a file in binary mode and return the path.

    Args:
        image_bytes: Bytes to write.
        filename: Destination path. Defaults to the placeholder name that
            was previously hard-coded, so existing calls behave the same.

    Returns:
        ``filename`` as given.
    """
    with open(filename, 'wb') as out_file:
        out_file.write(image_bytes)
    return filename
# file -> numpy array
def file_to_numpy(path_file):
    """Load an image file with ``cv2.imread`` and return the result.

    Args:
        path_file: Path of the image file to read.

    Returns:
        Whatever ``cv2.imread`` returns for ``path_file``.
        NOTE(review): OpenCV documents an empty result (``None`` in Python)
        when the file cannot be read — callers may need to check for that.
    """
    return cv2.imread(path_file)
# file -> bytes
def file_to_bytes(path_file):
    """Read a file in binary mode and return its full content.

    Args:
        path_file: Path of the file to read.

    Returns:
        The file content as ``bytes``.
    """
    with open(path_file, 'rb') as in_file:
        return in_file.read()
# file -> base64 string
def file_to_base64(path_file):
    """Read a file in binary mode and return its content base64-encoded.

    Args:
        path_file: Path of the file to read.

    Returns:
        The base64 text of the file content as a ``str``.
    """
    with open(path_file, 'rb') as in_file:
        image_bytes = in_file.read()
    return base64.b64encode(image_bytes).decode('utf8')
# base64 string -> bytes
def base64_to_bytes(image_base64):
    """Decode base64 data and return the raw bytes.

    Args:
        image_base64: Base64 text (``str``) or bytes-like data.

    Returns:
        The decoded ``bytes``.
    """
    return base64.b64decode(image_base64)
# base64 string -> numpy array
def base64_to_numpy(image_base64):
    """Decode base64-encoded image data into an image array.

    Args:
        image_base64: Base64 text of an encoded image (for example a .jpg).

    Returns:
        The result of ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``.
        NOTE(review): OpenCV documents an empty result (``None`` in Python)
        for data it cannot decode — callers may need to check for that.
    """
    image_bytes = base64.b64decode(image_base64)
    raw = np.frombuffer(image_bytes, dtype=np.uint8)
    return cv2.imdecode(raw, cv2.IMREAD_COLOR)
# base64 string -> file
def base64_to_file(image_base64, filename='你的文件名_base64.jpg'):
    """Decode base64 data, write the bytes to a file and return the path.

    Args:
        image_base64: Base64 text (``str``) or bytes-like data.
        filename: Destination path. Defaults to the placeholder name that
            was previously hard-coded, so existing calls behave the same.

    Returns:
        ``filename`` as given.
    """
    # Decode before opening so invalid input does not leave an empty file behind.
    image_bytes = base64.b64decode(image_base64)
    with open(filename, 'wb') as out_file:
        out_file.write(image_bytes)
    return filename