python2.7环境下的python代码详情
#encoding=utf-8
# Created by double lin at 2018/8/19
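# Fetch the page source
# Parse the page and check whether it contains the information we need
# Download the images
# Download the personal information
# While practising, learn to optimise the code:
# - keep it readable
# - keep coupling low (object-oriented design)
# - use independent modules instead of one script that runs from start to finish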
import json
import os
import requests
# 定义需要检索的年龄值
def query_age():
    """Prompt for an age and return the search bracket that contains it.

    One line is read with ``raw_input`` and converted to ``int``; the value is
    then mapped onto one of the fixed brackets used in the search URL.

    Returns:
        tuple: ``(startage, endage)``, both bounds inclusive.

    Raises:
        ValueError: if the typed text cannot be converted to an integer.
    """
    age = int(raw_input('请输入您想要匹配的年龄信息(如:20):'))
    # Ages of 20 and below used to fall through to the 61-99 bracket, so the
    # prompt's own example (20) searched for people over sixty.
    # NOTE(review): 0-20 mirrors the 0-150 bucket used for height; confirm the
    # search API accepts a startage below 21.
    if age <= 20:
        return 0, 20
    for low, high in ((21, 30), (31, 40), (41, 50), (51, 60)):
        if low <= age <= high:
            return low, high
    # Everything above 60 shares the top bracket.
    return 61, 99
# 定义需要进行检索的性别值
def query_sex():
    """Ask which gender to search for and return its numeric code.

    Returns:
        int: 1 when the typed answer is exactly '男', otherwise 2.
    """
    answer = raw_input('请输入您想要匹配的性别信息(如:女):')
    return 1 if answer == '男' else 2
# 定义城市信息--写死的信息格式
# 定义身高信息
def query_height():
    """Prompt for a height and return the search bracket that contains it.

    One line is read with ``raw_input`` and converted to ``int``; the value is
    then mapped onto one of the fixed brackets used in the search URL.

    Returns:
        tuple: ``(startheight, endheight)``, both bounds inclusive.

    Raises:
        ValueError: if the typed text cannot be converted to an integer.
    """
    height = int(raw_input('请输入您想要匹配的身高信息(如:161):'))
    # "<=" rather than "<": exactly 150 used to match no bracket and fell
    # through to 191-250, although the lowest bucket is 0-150.
    if height <= 150:
        return 0, 150
    for low, high in ((151, 160), (161, 170), (171, 180), (181, 190)):
        if low <= height <= high:
            return low, high
    # Anything taller than 190 goes into the top bracket.
    return 191, 250
# 月薪
def query_salary():
    """Prompt for a monthly salary and return the salary code for the search URL.

    Returns:
        int: 0 below 2000, 2 up to 5000, 3 up to 10000, 4 up to 20000 and
        5 above that.

    Raises:
        ValueError: if the typed text cannot be converted to an integer.
    """
    amount = int(raw_input('请输入您想要匹配的薪资水平(如:2000):'))
    # Guard clauses, checked from the lowest band upwards.
    if amount < 2000:
        return 0
    if amount <= 5000:
        return 2
    if amount <= 10000:
        return 3
    if amount <= 20000:
        return 4
    return 5
# 获取一个网页中所有的妹子图片
def get_one_page(url):
    """Fetch one page of search results and save every user listed on it.

    The JSON response is expected to carry the users under
    ``data -> list``; each entry is passed to ``download_img`` and
    ``download_info``.

    Args:
        url: Fully formatted search URL for a single result page.

    Raises:
        requests.RequestException: on connection problems, a timeout, or an
            HTTP error status.
        ValueError: if the response body is not valid JSON.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    r = requests.get(url, timeout=10)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    payload = r.json()
    # A page without results may omit "data"/"list" or set them to null;
    # treat that as an empty page rather than crashing.
    users = (payload.get('data') or {}).get('list') or []
    for item in users:
        download_img(item)
        download_info(item)
# 判断是否存在文件夹,如果不存在,创建文件夹,然后下载图片到文件夹中
def download_img(item):
    """Download a user's avatar to ``images/<username>.png``.

    The ``images`` directory is created when missing. The characters ``*``,
    ``\\`` and ``/`` are removed from the username before it is used as a
    file name. An existing file is never overwritten.

    Args:
        item: Mapping that provides at least ``username`` and ``avatar``
            (the image URL).

    Raises:
        requests.RequestException: on connection problems, a timeout, or an
            HTTP error status.
    """
    if not os.path.exists('images'):
        os.mkdir('images')
    username = item['username']
    for forbidden in ('*', '\\', '/'):
        username = username.replace(forbidden, '')
    file_path = u'images/{}.png'.format(username)
    if os.path.exists(file_path):
        print('您要下载的图片已存在,继续下一张图片下载')
        return
    # Download BEFORE creating the file: previously a failed request left an
    # empty file behind that every later run treated as "already downloaded".
    r = requests.get(item['avatar'], timeout=10)
    r.raise_for_status()
    with open(file_path, 'wb') as f:
        f.write(r.content)
    # Unicode template: on Python 2 a byte-string template raises
    # UnicodeEncodeError when the username contains non-ASCII characters.
    print(u'{}的信息下载完毕'.format(username))
# 保存客户的个人信息
def download_info(item):
    """Write a user's profile fields to ``info/<username>.txt`` as UTF-8 text.

    The ``info`` directory is created when missing. The characters ``*``,
    ``\\`` and ``/`` are removed from the username before it is used as a
    file name (the cleaned name is also what gets written to the file).
    An existing file is never overwritten.

    Args:
        item: Mapping that provides ``userid``, ``username``, ``province``,
            ``city``, ``monolog``, ``education`` and ``birthdayyear``.
            The mapping itself is not modified.

    Raises:
        KeyError: if one of the fields is missing; no file is created then.
    """
    # Local import keeps this block self-contained. io.open encodes unicode
    # text explicitly and behaves the same on Python 2 and Python 3.
    import io

    if not os.path.exists('info'):
        os.mkdir('info')
    username = item['username']
    for forbidden in ('*', '\\', '/'):
        username = username.replace(forbidden, '')
    file_path = u'info/{}.txt'.format(username)
    if os.path.exists(file_path):
        return
    # Collect every value before opening the file so that a missing key
    # cannot leave a half-written file behind.
    lines = (
        (u'ID:', item['userid']),
        (u'用户名:', username),
        (u'省份:', item['province']),
        (u'城市:', item['city']),
        (u'内心独白:', item['monolog']),
        (u'学历:', item['education']),
        (u'生日年份:', item['birthdayyear']),
    )
    # Unicode labels + explicit UTF-8: concatenating the old byte-string
    # labels with unicode JSON values forced an implicit ASCII decode on
    # Python 2. newline='\n' keeps the exact line endings binary mode wrote.
    with io.open(file_path, 'w', encoding='utf-8', newline='\n') as f:
        for label, value in lines:
            f.write(u'{}{}\n'.format(label, value))
if __name__ == '__main__':
    # Collect every search filter from the user before any request is sent.
    # The range helpers return a pair, unpacked here into two names.
    startage, endage = query_age()
    sex = query_sex()
    startheight, endheight = query_height()
    salary = query_salary()
    filters = (startage, endage, sex, startheight, endheight, salary)
    # cityid and marry are fixed in the query string; only the page number
    # changes between requests. Pages 1 through 10 are fetched.
    for page in range(1, 11):
        url = 'http://www.lovewzly.com/api/user/pc/list/search?startage={}&endage={}&gender={}&cityid=180&startheight={}&endheight={}&salary={}&marry=1&page={}'.format(*(filters + (page,)))
        get_one_page(url)
Python3.7环境下的爬虫代码信息
#encoding=utf-8
# Created by double lin at 2018/8/19
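# Fetch the page source
# Parse the page and check whether it contains the information we need
# Download the images
# Download the personal information
# While practising, learn to optimise the code:
# - keep it readable
# - keep coupling low (object-oriented design)
# - use independent modules instead of one script that runs from start to finish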
import json
import os
import requests
import re
# 定义需要检索的年龄值
def query_age():
    """Prompt for an age and return the search bracket that contains it.

    One line is read with ``input`` and converted to ``int``; the value is
    then mapped onto one of the fixed brackets used in the search URL.

    Returns:
        tuple: ``(startage, endage)``, both bounds inclusive.

    Raises:
        ValueError: if the typed text cannot be converted to an integer.
    """
    age = int(input('请输入您想要匹配的年龄信息(如:20):'))
    # Ages of 20 and below used to fall through to the 61-99 bracket, so the
    # prompt's own example (20) searched for people over sixty.
    # NOTE(review): 0-20 mirrors the 0-150 bucket used for height; confirm the
    # search API accepts a startage below 21.
    if age <= 20:
        return 0, 20
    brackets = ((21, 30), (31, 40), (41, 50), (51, 60))
    for low, high in brackets:
        if low <= age <= high:
            return low, high
    # Everything above 60 shares the top bracket.
    return 61, 99
# 定义需要进行检索的性别值
def query_sex():
    """Ask which gender to search for and return its numeric code.

    Returns:
        int: 1 when the typed answer is exactly '男', otherwise 2.
    """
    answer = input('请输入您想要匹配的性别信息(如:女):')
    return 1 if answer == '男' else 2
# 定义城市信息--写死的信息格式
# 定义身高信息
def query_height():
    """Prompt for a height and return the search bracket that contains it.

    One line is read with ``input`` and converted to ``int``; the value is
    then mapped onto one of the fixed brackets used in the search URL.

    Returns:
        tuple: ``(startheight, endheight)``, both bounds inclusive.

    Raises:
        ValueError: if the typed text cannot be converted to an integer.
    """
    height = int(input('请输入您想要匹配的身高信息(如:161):'))
    # "<=" rather than "<": exactly 150 used to match no bracket and fell
    # through to 191-250, although the lowest bucket is 0-150.
    if height <= 150:
        return 0, 150
    brackets = ((151, 160), (161, 170), (171, 180), (181, 190))
    for low, high in brackets:
        if low <= height <= high:
            return low, high
    # Anything taller than 190 goes into the top bracket.
    return 191, 250
# 月薪
def query_salary():
    """Prompt for a monthly salary and return the salary code for the search URL.

    Returns:
        int: 0 below 2000, 2 up to 5000, 3 up to 10000, 4 up to 20000 and
        5 above that.

    Raises:
        ValueError: if the typed text cannot be converted to an integer.
    """
    amount = int(input('请输入您想要匹配的薪资水平(如:2000):'))
    # Guard clauses, checked from the lowest band upwards.
    if amount < 2000:
        return 0
    if amount <= 5000:
        return 2
    if amount <= 10000:
        return 3
    if amount <= 20000:
        return 4
    return 5
# 获取一个网页中所有的妹子图片
def get_one_page(url):
    """Fetch one page of search results and save every user listed on it.

    The JSON response is expected to carry the users under
    ``data -> list``; each entry is printed and then passed to
    ``download_img`` and ``download_info``.

    Args:
        url: Fully formatted search URL for a single result page.

    Raises:
        requests.RequestException: on connection problems, a timeout, or an
            HTTP error status.
        ValueError: if the response body is not valid JSON.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    r = requests.get(url, timeout=10)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    payload = r.json()
    # A page without results may omit "data"/"list" or set them to null;
    # treat that as an empty page rather than crashing.
    users = (payload.get('data') or {}).get('list') or []
    for item in users:
        print(item)
        download_img(item)
        download_info(item)
# 判断是否存在文件夹,如果不存在,创建文件夹,然后下载图片到文件夹中
def download_img(item):
    """Download a user's avatar to ``images/<username>.png``.

    The ``images`` directory is created when missing. The characters ``*``,
    ``\\`` and ``/`` are removed from the username before it is used as a
    file name. An existing file is never overwritten.

    Args:
        item: Mapping that provides at least ``username`` and ``avatar``
            (the image URL).

    Raises:
        requests.RequestException: on connection problems, a timeout, or an
            HTTP error status.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('images', exist_ok=True)
    username = item['username']
    for forbidden in ('*', '\\', '/'):
        username = username.replace(forbidden, '')
    file_path = u'images/{}.png'.format(username)
    if os.path.exists(file_path):
        print('您要下载的图片已存在,继续下一张图片下载')
        return
    # Download BEFORE creating the file: previously a failed request left an
    # empty file behind that every later run treated as "already downloaded".
    r = requests.get(item['avatar'], timeout=10)
    r.raise_for_status()
    with open(file_path, 'wb') as f:
        f.write(r.content)
    print('{}的信息下载完毕'.format(username))
# 保存客户的个人信息
def download_info(item):
    """Write a user's profile fields to ``info/<username>.txt`` as UTF-8 text.

    The ``info`` directory is created when missing. The characters ``*``,
    ``\\`` and ``/`` are removed from the username before it is used as a
    file name (the cleaned name is also what gets written to the file).
    An existing file is never overwritten.

    Args:
        item: Mapping that provides ``username``, ``province``, ``city``,
            ``education``, ``monolog`` and ``birthdayyear``. The mapping
            itself is not modified.

    Raises:
        KeyError: if one of the fields is missing; no file is created then.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('info', exist_ok=True)
    username = item['username']
    for forbidden in ('*', '\\', '/'):
        username = username.replace(forbidden, '')
    file_path = u'info/{}.txt'.format(username)
    if os.path.exists(file_path):
        return
    # Collect every value before opening the file so that a missing key
    # cannot leave a half-written file behind.
    lines = (
        ('用户名:', username),
        ('省份:', item['province']),
        ('城市:', item['city']),
        ('学历:', item['education']),
        ('内心独白:', item['monolog']),
        ('生日:', item['birthdayyear']),
    )
    # Text mode with an explicit encoding replaces the per-line bytes(...)
    # conversion; newline='\n' keeps the exact line endings binary mode wrote.
    # str.format also accepts non-str values, which "+" concatenation did not.
    with open(file_path, 'w', encoding='utf-8', newline='\n') as f:
        f.writelines('{}{}\n'.format(label, value) for label, value in lines)
# 定义函数,删除一个文本中出现的包含表情的特殊字符
# Compiled once at import time so remove_emoji works even when this file is
# imported as a module (the script-only global it used before does not exist
# then). Python 3 strings hold whole code points, so the ranges are written as
# code points; UTF-16 surrogate pairs never match real emoji in a str.
_EMOJI_RE = re.compile(
    u'['
    u'\U0001F600-\U0001F64F'  # emoticons
    u'\U0001F300-\U0001F5FF'  # symbols & pictographs
    u'\U0001F680-\U0001F6FF'  # transport & map symbols
    u'\U0001F1E0-\U0001F1FF'  # regional-indicator letters (flags)
    u']+'
)


def remove_emoji(text):
    """Return ``text`` with emoji characters removed.

    Covers emoticons, symbols & pictographs, transport & map symbols and the
    regional-indicator letters that make up flags.

    Args:
        text: The string to clean.

    Returns:
        str: ``text`` without the characters in the ranges above.
    """
    return _EMOJI_RE.sub(r'', text)
if __name__ == '__main__':
    # Collect every search filter from the user before any request is sent.
    startage, endage = query_age()
    sex = query_sex()
    startheight, endheight = query_height()
    salary = query_salary()
    # Not used by the download flow yet: pattern intended for stripping
    # emoji from usernames.
    emoji_pattern = re.compile(
        u"(\ud83d[\ude00-\ude4f])|"  # emoticons
        u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
        u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
        u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
        u"(\ud83c[\udde0-\uddff])"  # flags (iOS)
        "+", flags=re.UNICODE)
    filters = (startage, endage, sex, startheight, endheight, salary)
    # cityid and marry are fixed in the query string; only the page number
    # changes between requests. Pages 1 through 10 are fetched.
    for page in range(1, 11):
        url = 'http://www.lovewzly.com/api/user/pc/list/search?startage={}&endage={}&gender={}&cityid=180&startheight={}&endheight={}&salary={}&marry=1&page={}'.format(*filters, page)
        get_one_page(url)
困扰点:python3中将字符串写入到txt文件时,使用的方法与python2稍有不同。由于文件是以'wb'(二进制)模式打开的,python3的write()只接受bytes,因此需要先把字符串按指定编码(如UTF-8)转换为bytes再写入:
python2 :
# Python 2 excerpt from download_info: the file is opened in binary mode and
# each line is built by plain string concatenation before being written.
# NOTE(review): assumes every item value is a str; if the values are unicode
# (e.g. decoded from JSON), joining them with the non-ASCII byte-string labels
# triggers an implicit ASCII decode on Python 2 - confirm.
with open(file_path, 'wb') as f:
    f.write('ID:'+item['userid']+'\n')
    f.write('用户名:'+item['username']+'\n')
    f.write('省份:'+item['province']+'\n')
    f.write('城市:'+item['city']+'\n')
    f.write('内心独白:'+item['monolog']+'\n')
    f.write('学历:'+item['education']+'\n')
    f.write('生日年份:'+item['birthdayyear']+'\n')
    # Redundant: the with-statement already closes the file on exit.
    f.close()
python3:
# Python 3 excerpt from download_info: the file is opened in binary mode, so
# every line is converted to bytes before it is written.
with open(file_path, 'wb') as f:
    # f.write('ID:%d' %int(item['userid'])+'\n')
    # In Python 3, writing a string to a file opened in binary mode requires
    # converting it to bytes first,
    # i.e. f.write(bytes(your_string, 'encoding such as utf-8')).
    f.write(bytes('用户名:'+item['username'] +'\n', 'UTF-8'))
    f.write(bytes('省份:' + item['province'] +'\n', 'UTF-8'))
    f.write(bytes('城市:' + item['city'] +'\n', 'UTF-8'))
    f.write(bytes('学历:' + item['education'] +'\n', 'UTF-8'))
    f.write(bytes('内心独白:' + item['monolog'] +'\n', 'UTF-8'))
    f.write(bytes('生日:' + item['birthdayyear'] +'\n', 'UTF-8'))
    # Redundant: the with-statement already closes the file on exit.
    f.close()
谢谢大家!!