Python 3 Web Scraping Basics (A Beginner's Guide)

What is a web crawler?

- Every site publishes a crawling policy in its robots.txt (for example https://www.baidu.com/robots.txt), which spells out what may and may not be crawled (see the sketch below).

- Technically speaking: whatever you can see, you can scrape.

- Legally: much of this is a gray area.
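
Before writing any crawler, it is worth fetching the target site's robots.txt to see those rules. A minimal sketch, using the requests module introduced below:

```python
# Fetch a site's crawling policy; the body lists what each User-agent may crawl.
import requests

res = requests.get('https://www.baidu.com/robots.txt')
print(res.text)
```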

I. The requests module (for simulating HTTP requests)

```python
# Install: pip3 install requests
# urllib (plus urllib2 in Python 2) ships with Python; the requests module is a
# friendlier wrapper built on top of the same functionality.

# **** Basic usage ****
# Import the module
import requests

# Send a GET request; the call returns a response object
res = requests.get('https://www.baidu.com')

# The content that came back
print(res.text)

with open('a.html', 'w', encoding='utf-8') as f:
    f.write(res.text)

# The status code of the response
print(res.status_code)
```

Introduction to the requests module

```python
# **** Carrying parameters: Chinese text must be URL-encoded ****
import requests
from urllib.parse import urlencode

key = input('Enter a search term: ')
# Parameters containing Chinese or other special characters must be encoded first
key_search = urlencode({'wd': key})
# print(key_search)

url = 'https://www.baidu.com/s?%s' % key_search

# One basic anti-scraping countermeasure: carry a User-Agent header
res = requests.get(url,
                   headers={
                       'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                   }
                   )

with open('a.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
```

Simple requests usage - 1

```python
# Encoding by hand every time is tedious; use the params argument of requests instead
import requests

key = input('Enter a search term: ')

# One basic anti-scraping countermeasure: carry a User-Agent header
res = requests.get('https://www.baidu.com/s',
                   headers={
                       'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                   },
                   # Parameters carried in GET form; requests encodes them for you
                   params={'wd': key, 'pn': 70}
                   )

print(res.text)

with open('a.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
```

```python
# Cookies
import requests

Cookies = {
    'user_session': '2OjTrCoCXigz5gB7trCsYfMBl0jQ-abqjXdCcas9UqwVmD7y',
}

# GitHub places few restrictions on request headers, so no custom User-Agent is
# needed here; other sites may require one.
response = requests.get('https://github.com/settings/emails',
                        cookies=Cookies)

print('lich_qiu@163.com' in response.text)  # True
```

Simple requests usage - 2

GET parameter overview:

- params=dict: the parameters carried in GET form
- headers=dict, commonly including:
  - User-Agent: the client type
  - Referer: the address the request came from (the previous page); also used for image hotlink protection
  - Host
  - Cookie: a string. Cookies really belong in the request headers, but because they are used so often, requests handles them separately through the cookies argument, e.g. cookies={'user_session': 'xxx'}

POST parameter overview:

- params
- headers
- cookies
- data: the data for the request body, urlencoded format by default
- json: pass a dict, and the request goes out with 'content-type': 'application/json' (see the sketch below)
- allow_redirects: whether to follow redirects; defaults to True and is rarely changed
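
The difference between data= and json= is easiest to see against an echo service. A minimal sketch using httpbin.org (the same test service used later in this post):

```python
# The same payload sent two ways; httpbin echoes back the headers it received.
import requests

r1 = requests.post('http://httpbin.org/post', data={'a': 1})  # body: a=1 (urlencoded)
r2 = requests.post('http://httpbin.org/post', json={'a': 1})  # body: {"a": 1}

print(r1.json()['headers']['Content-Type'])  # application/x-www-form-urlencoded
print(r2.json()['headers']['Content-Type'])  # application/json
```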

```python
# Step 1: send a GET request to https://github.com/login
import requests
import re

res_login = requests.get('https://github.com/login',
                         headers={
                             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                         }
                         )
# print(res_login.text)

# The returned page contains an authenticity_token; pull it out.
# re.S treats the whole string as a single line.
authenticity_token = re.findall(r'name="authenticity_token".*?value="(.*?)"', res_login.text, re.S)[0]
print(authenticity_token)

# Grab the cookie from before logging in
login_cookie = res_login.cookies.get_dict()
print(login_cookie)

# Step 2: POST the username and password to https://github.com/session

data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': 'lich_qiu@163.com',
    'password': 'zhang319!',
    'webauthn-support': 'supported'
}

res = requests.post(url='https://github.com/session',
                    # The data for the request body
                    data=data,
                    # Carry the cookie that has not yet passed authentication
                    cookies=login_cookie,
                    )

# On a successful login the server returns a cookie; take it and carry it on later
# requests. res.cookies.get_dict() turns the returned cookies into a dict.
res_cookie = res.cookies.get_dict()
print(res_cookie)

# Step 3: visit https://github.com/settings/emails and check whether
# lich_qiu@163.com appears in the returned data

response = requests.get('https://github.com/settings/emails',
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                            'Referer': 'https://github.com/settings/profile'
                        },
                        cookies=res_cookie,
                        )  # GitHub places few restrictions on headers; other sites may need more

print('lich_qiu@163.com' in response.text)  # True
```

GET and POST: overview and simple usage

```python
# Encoding issues
import requests
response = requests.get('http://www.autohome.com/news')

# The encoding the current page appears to use
print(response.apparent_encoding)

# Switch the decoding to gbk
response.encoding = 'gbk'
print(response.text)
```

Encoding issues
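
Rather than hard-coding 'gbk', a common idiom is to feed the detected encoding straight back into the response:

```python
# Decode the body with whatever encoding the page itself appears to use.
import requests

response = requests.get('http://www.autohome.com/news')
response.encoding = response.apparent_encoding
print(response.text)
```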

```python
# Scraping an image
# For a fairly small file you can pull the whole content down in one go.
import requests
res = requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1556732811646&di=2bd8396b35047f33fbcd6b023f5787b7&imgtype=0&src=http%3A%2F%2Fs15.sinaimg.cn%2Fmw690%2F0066UWNtgy6Viz3mEBoce%26690')

with open('a.jpg', 'wb') as f:
    f.write(res.content)
```

Scraping images

```python
# Scraping a video
# The file is large, so loop over it with the iter_content() method.
# (For very large downloads, also pass stream=True to requests.get so the body
# is not buffered in memory all at once.)
import requests

res = requests.get('http://static.yximgs.com/s1/videos/www_main-059ce9beee.mp4')
with open('a.mp4', 'wb') as f:
    for chunk in res.iter_content():
        f.write(chunk)
```

Scraping videos

```python
# Parsing JSON
import requests

response = requests.get('http://httpbin.org/get')

import json
res1 = json.loads(response.text)  # too much work

res2 = response.json()  # get the JSON data directly

print(res1 == res2)  # True: the two results are identical
```

Parsing JSON

```python
# The response object
print(response.text)                # the body as text
print(response.content)             # the body as bytes
print(response.status_code)         # the status code
print(response.headers)             # the response headers
print(response.cookies)             # the cookies sent back by the server
print(response.cookies.get_dict())  # the returned cookies as a dict
print(response.cookies.items())     # like dict.items()
print(response.url)                 # the final url, after any redirects
print(response.history)             # the list of redirect responses along the way
print(response.encoding)            # the encoding used to decode the body
```

Working with the response object
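
Two of these attributes are easy to confuse: url is the final address after any redirects, while history holds the intermediate responses. A minimal sketch against httpbin's redirect endpoint:

```python
# /redirect/1 answers with a 302 that points at /get.
import requests

r = requests.get('http://httpbin.org/redirect/1')
print(r.url)      # http://httpbin.org/get  (the address after the redirect)
print(r.history)  # [<Response [302]>]      (the responses along the way)
```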

II. Advanced usage of the requests module

```python
# 1. SSL cert verification
import requests

# verify=False: do not verify the certificate
# response = requests.get('https://www.12306.cn', verify=False)

# Carry a client certificate
response = requests.get('https://www.12306.cn',
                        cert=('/path/server.crt', '/path/key'))
```

1. SSL cert verification

```python
# 2. Using a proxy
# HTTP proxies
import requests

proxies = {
    # A proxy with credentials: the user name and password go before the @
    'http': 'http://lich:123@112.85.151.216:9999',
    # 'http': 'http://223.241.116.173:8010',
    'https': 'https://localhost:8888',
}

response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)
```

2. Using proxies

```python
# SOCKS proxies (these need the PySocks extra: pip3 install requests[socks])
import requests

proxies = {
    # A proxy with credentials: the user name and password go before the @
    'http': 'socks5://lich:123@112.85.151.216:9999',
    # 'http': 'socks5://223.241.116.173:8010',
    # 'https': 'socks5://localhost:8888',
}

response = requests.get('https://www.12306.cn', proxies=proxies)

print(response.status_code)
```

3. SOCKS proxies

```python
# Timeout settings
import requests
response = requests.get('https://www.12306.cn', timeout=0.0001)
```

4. Timeout settings
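
In a real crawler you normally catch the timeout rather than let it kill the whole run. A minimal sketch:

```python
# timeout is in seconds; an absurdly small value forces the exception.
import requests

try:
    response = requests.get('https://www.12306.cn', timeout=0.0001)
except requests.exceptions.Timeout as e:
    print('the request timed out:', e)
```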

```python
# Uploading a file
import requests
files = {
    'file': open('a.jpg', 'rb')
}
response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)
```

5. Uploading files

III. Crawler project examples

```python
# Single-threaded crawl
import requests
import re
import os


# A generic helper: fetch the content of a page by its address
def get_page(url):
    ret = requests.get(url)
    if ret.status_code == 200:
        return ret.text


def parse_res(text):
    urls = re.findall(r'class="categoryem".*?href="(.*?)"', text, re.S)
    print(urls)
    for url in urls:
        print(url)
        yield 'https://www.pearvideo.com/' + url


def parse_detail(text):
    # print(text)
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    # print('actual address of the video file:', movie_url)
    return movie_url


def base_dir():
    base = os.path.dirname(os.path.abspath(__file__))
    return base


def download_movie(url):
    import time
    movie_content = requests.get(url)
    file_name = str(time.time()) + '.mp4'
    with open('%s/download/%s' % (base_dir(), file_name), 'wb') as f:
        f.write(movie_content.content)


if __name__ == '__main__':
    res = get_page('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=6&start=1')
    # res is the content of the listing page
    urls = parse_res(res)
    for url in urls:
        try:
            res_detail = get_page(url)
            movie_url = parse_detail(res_detail)
            download_movie(movie_url)
        except Exception as e:
            print(e)
```

Crawling pearvideo with a single thread

```python
# Multi-threaded crawl
import requests
import re
import os
from concurrent.futures import ThreadPoolExecutor

# First create a pool of 60 threads
pool = ThreadPoolExecutor(60)


# A generic helper: fetch the content of a page by its address
def get_page(url):
    ret = requests.get(url)
    if ret.status_code == 200:
        return ret.text


def parse_res(text):
    # text is a Future here; .result() unwraps the return value of the previous task
    text = text.result()
    urls = re.findall(r'class="categoryem".*?href="(.*?)"', text, re.S)
    print(urls)
    for url in urls:
        print(url)
        # yield 'https://www.pearvideo.com/' + url
        pool.submit(get_page, 'https://www.pearvideo.com/' + url).add_done_callback(parse_detail)


def parse_detail(text):
    # print(text)
    text = text.result()
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    # print('actual address of the video file:', movie_url)
    pool.submit(download_movie, movie_url)


def base_dir():
    base = os.path.dirname(os.path.abspath(__file__))
    base = os.path.join(base, 'download')
    return base


def download_movie(url):
    import time
    movie_content = requests.get(url)
    file_name = str(time.time()) + '.mp4'
    file = os.path.join(base_dir(), file_name)
    if movie_content.status_code == 200:
        with open(file, 'wb') as f:
            f.write(movie_content.content)


if __name__ == '__main__':
    for i in range(3):
        url = 'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=6&start=%s' % (i * 12 + 1)
        # Submit the page fetch to the pool;
        # add_done_callback() fires once that job has finished
        pool.submit(get_page, url).add_done_callback(parse_res)
```

Crawling pearvideo with multiple threads

```python
# Crawl Autohome news with the BeautifulSoup module
import requests
from bs4 import BeautifulSoup
import time
import os

# https://www.autohome.com.cn/news/2/#liststart
for i in range(1, 10):
    url = 'https://www.autohome.com.cn/news/%s/#liststart' % i
    ret = requests.get(url)
    # print(ret.text)

    # soup = BeautifulSoup(ret.text, 'lxml')
    soup = BeautifulSoup(ret.text, 'html.parser')
    ul = soup.find(name='ul', attrs={'class': 'article'})
    li_list = ul.find_all(name='li')
    for li in li_list:
        try:
            # Pull out the url of the news item
            news_url = 'https:' + li.find(name='a').get('href')   # read an attribute
            news_title = li.find(name='h3').text                  # the text of the h3 tag
            news_desc = li.find(name='p').text                    # the news summary
            news_img = 'https:' + li.find(name='img').get('src')  # the news image

            print('''news title: %s
news summary: %s
news url: %s
news image url: %s''' % (news_title, news_desc, news_url, news_img)
                  )

            # Download the image of the news item
            response = requests.get(news_img)
            time_name = str(time.time()) + '.jpg'
            base_path = os.path.dirname(os.path.abspath(__file__))
            download_path = os.path.join(base_path, 'download')
            file_name = os.path.join(download_path, time_name)
            with open(file_name, 'wb') as f:
                f.write(response.content)
        except Exception as e:
            print(e)
```

Crawling Autohome news

```python
# Open the chouti homepage
import requests

ret = requests.get('https://dig.chouti.com',
                   headers={
                       'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                   }
                   )

print(ret.status_code)
print(ret.text)

# Simulate logging in. A code of 9999 in the body means the login succeeded,
# but liking an article with this alone still fails.
ret = requests.post('https://dig.chouti.com/login',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
                    },
                    data={
                        'phone': '8618901847206',
                        'password': '123.abcd',
                        'oneMonth': 1
                    }
                    )
# print(ret.text)
# Grab the cookie returned after the login
cookie = ret.cookies.get_dict()

# Like an article: send a POST request to https://dig.chouti.com/link/vote?linksId=25944651
res = requests.post('https://dig.chouti.com/link/vote?linksId=25944651',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                        'referer': 'https://dig.chouti.com/'
                    },
                    cookies=cookie
                    )
print(res.text)
```

Auto-upvoting on chouti: analysis

```python
# Step 1: open the chouti homepage
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://dig.chouti.com/',
                   headers={
                       'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                   },
                   )
ret_cookie = ret.cookies.get_dict()

# Step 2: simulate logging in
res = requests.post('https://dig.chouti.com/login',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                        'referer': 'https://dig.chouti.com/'
                    },
                    cookies=ret_cookie,
                    data={
                        'phone': '8618901847206',
                        'password': '123.abcd',
                        'oneMonth': 1
                    }
                    )
print(res.text)
res_cookie = res.cookies.get_dict()

# # Step 3 (hard-coded to a single article; use the Step 3 below instead)
# # Like an article: POST to https://dig.chouti.com/link/vote?linksId=25944651
# response = requests.post('https://dig.chouti.com/link/vote?linksId=25944651',
#                          headers={
#                              'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
#                              'referer': 'https://dig.chouti.com/'
#                          },
#                          cookies=ret_cookie
#                          )
# print(response.text)

# Step 3, after a successful login: walk the listing pages
post_url_list = []
for i in range(5, 10):
    response = requests.get('https://dig.chouti.com/all/hot/recent/%s' % i,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            }
                            )
    soup = BeautifulSoup(response.text, 'html.parser')
    div_all = soup.find(name='div', attrs={'class': 'content-list', 'id': 'content-list'})
    div_list = div_all.find_all(name='div', attrs={'class': 'news-pic'})
    for div in div_list:
        try:
            news_id = div.find(name='img').get('lang')
            # 'https://dig.chouti.com/link/vote?linksId=%s' % news_id
            post_url = 'https://dig.chouti.com/link/vote?linksId=%s' % news_id
            post_url_list.append(post_url)
        except Exception as e:
            print('something went wrong here:', e)
# print(post_url_list)

# Step 4: loop over the articles and like each one.
# Liking an article means a POST request to https://dig.chouti.com/link/vote?linksId=25944651
for url in post_url_list:
    up_news = requests.post(url,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            },
                            cookies=ret_cookie
                            )
    print(up_news.text)
```

Auto-upvoting on chouti: implementation

```python
# Step 1: open the chouti homepage
import requests
from bs4 import BeautifulSoup

ret = requests.get('https://dig.chouti.com/',
                   headers={
                       'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                   },
                   )
ret_cookie = ret.cookies.get_dict()

# Step 2: simulate logging in
res = requests.post('https://dig.chouti.com/login',
                    headers={
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                        'referer': 'https://dig.chouti.com/'
                    },
                    cookies=ret_cookie,
                    data={
                        'phone': '8618901847206',
                        'password': '123.abcd',
                        'oneMonth': 1
                    }
                    )
print(res.text)
res_cookie = res.cookies.get_dict()

# Step 3, after a successful login: walk the listing pages,
# collect the news IDs and build the urls
post_url_list = []
news_id_list = []
for i in range(5, 10):
    response = requests.get('https://dig.chouti.com/all/hot/recent/%s' % i,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            }
                            )
    soup = BeautifulSoup(response.text, 'html.parser')
    div_all = soup.find(name='div', attrs={'class': 'content-list', 'id': 'content-list'})
    div_list = div_all.find_all(name='div', attrs={'class': 'news-pic'})
    for div in div_list:
        try:
            news_id = div.find(name='img').get('lang')
            news_id_list.append(news_id)
            # 'https://dig.chouti.com/link/vote?linksId=%s' % news_id
            post_url = 'https://dig.chouti.com/link/vote?linksId=%s' % news_id
            post_url_list.append(post_url)
        except Exception as e:
            print('something went wrong here:', e)

# Step 4: loop over the articles and cancel the like on each one.
# Cancelling a like means a POST request to https://dig.chouti.com/vote/cancel/vote.do
# with form data such as {linksId: 25933276}
url = 'https://dig.chouti.com/vote/cancel/vote.do'
for news_id in news_id_list:
    up_news = requests.post(url,
                            headers={
                                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
                                'referer': 'https://dig.chouti.com/'
                            },
                            cookies=ret_cookie,
                            data={'linksId': news_id}
                            )
    print(up_news.text)
```

Auto-cancelling upvotes on chouti: implementation

IV. The bs4 module

```python
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b class="boldest">The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# 1. Basic usage
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')
# soup = BeautifulSoup(open('a.html'), 'lxml')

print(soup.p)                 # if several identical tags exist, only the first is returned
print(soup.p.b.text)          # The Dormouse's story
print(soup.p.b.get('class'))  # ['boldest']
print(soup.a)                 # if several identical tags exist, only the first is returned

# 2. Getting the tag name
print(soup.p.name)

# 3. Getting the tag attributes
print(soup.p.attrs)

# 4. Getting the tag contents
print(soup.p.string)   # the text when p holds exactly one piece of text, otherwise None
print(soup.p.strings)  # a generator over all the text under p
print(soup.p.text)     # all the text under p
for line in soup.stripped_strings:  # with surrounding whitespace stripped
    print(line)

'''
When a tag contains several children, .string cannot tell which child's content is
meant, so it returns None; with a single child it returns that child's text. For a
structure like <p>哈哈哈哈<a>aaaa</a><b>bbbbb</b></p>, soup.p.string is None, while
soup.p.strings still finds all the text.
'''

# 5. Nested selection
print(soup.head.title.string)
print(soup.body.a.string)

# 6. Children and descendants
print(soup.p.contents)  # all direct children of p
print(soup.p.children)  # an iterator over all direct children of p

for i, child in enumerate(soup.p.children):
    print(i, child)

print(soup.p.descendants)  # all descendants of p: every tag below it is included
for i, child in enumerate(soup.p.descendants):
    print(i, child)

# 7. Parents and ancestors
print(soup.a.parent)   # the parent of the a tag
print(soup.a.parents)  # all ancestors of the a tag: its parent, its parent's parent, ...

# 8. Siblings
print('=====>')
print(soup.a.next_sibling)      # the next sibling
print(soup.a.previous_sibling)  # the previous sibling

print(list(soup.a.next_siblings))  # the siblings after it => a generator
print(soup.a.previous_siblings)    # the siblings before it => a generator
```

1. Traversing the document tree

```python
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b class="boldest">The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Five kinds of filters: a string, a regular expression, a list, True, and a function
# soup.find()
# find(self, name=None, attrs={}, recursive=True, text=None, **kwargs)
# name: the tag name; attrs: attributes; text: text content;
# recursive=False turns off recursive search (the default is True); **kwargs

from bs4 import BeautifulSoup

# 1. Exact string match
soup = BeautifulSoup(html_doc, 'lxml')
# ret = soup.find(name='body')                # condition 1
# ret = soup.find(attrs={'class': 'title'})   # condition 2
ret = soup.find(text="The Dormouse's story")  # condition 3
print(ret)
print(type(ret))

# 2. Regular-expression match
import re
soup = BeautifulSoup(html_doc, 'lxml')
# ret = soup.find(name=re.compile('^p'))
# ret = soup.find(attrs={'class': re.compile('^s')})
ret = soup.find(name='a', text=re.compile('^L'))
print(ret)

# 3. List match
soup = BeautifulSoup(html_doc, 'lxml')
# ret = soup.find_all(name=['a', 'b'])
# ret = soup.find_all(attrs={'class': ['title', 'sister']})
ret = soup.find_all(text=['Elsie', 'Lacie'])
print(ret)
```

2. Searching the document tree
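
The comment above names five kinds of filters, but the snippet only demonstrates the first three. A minimal sketch of the remaining two, True and a function, following the bs4 documentation:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')

# 4. True match: keep every tag that has the attribute at all
print(soup.find_all(id=True))  # all tags with an id attribute


# 5. Function match: the function receives each tag and keeps those
# for which it returns True
def has_class_but_no_id(tag):
    return tag.has_attr('class') and not tag.has_attr('id')


print(soup.find_all(has_class_but_no_id))
```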

V. The selenium module

```python
# The most basic usage
from selenium import webdriver
import time

# webdriver.Chrome() returns an object that stands in for my browser
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
print(browser.page_source)

time.sleep(2)

# Close the browser (never skip this)
browser.close()
```

Basic usage

All the selectors:

1. find_element_by_id: find by id
2. find_element_by_link_text: find by the text of a link
3. find_element_by_partial_link_text: fuzzy match on the text of a link
4. find_element_by_tag_name: find by tag name
5. find_element_by_class_name: find by class name
6. find_element_by_name: find by the name attribute
7. find_element_by_css_selector: find by CSS selector
8. find_element_by_xpath: find by XPath

Selector overview
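
A minimal sketch of a few of these selectors against Baidu's homepage (assuming its search box still has id "kw" and name "wd"):

```python
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')

print(browser.find_element_by_id('kw'))                    # by id
print(browser.find_element_by_name('wd'))                  # by name (the same input)
print(browser.find_element_by_css_selector('#kw'))         # by CSS selector
print(browser.find_element_by_xpath('//input[@id="kw"]'))  # by XPath

browser.close()
```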

```python
# Simple usage 1: open Baidu and search for a keyword from the search box
from selenium import webdriver
import time

try:
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com')

    time.sleep(2)

    search_input = browser.find_element_by_id('kw')
    key = input('Enter a search term: ')
    search_input.send_keys(key)
    time.sleep(5)

except Exception as e:
    print(e)
finally:
    browser.close()
```

Simple usage - 1

```python
# Simple usage 2: open Baidu and complete a login
from selenium import webdriver
import time

try:
    browser = webdriver.Chrome()
    # Implicit wait: when fetching an element, wait up to 3 seconds for it to appear
    browser.implicitly_wait(3)
    browser.get('https://www.baidu.com')

    time.sleep(2)

    login_btn = browser.find_element_by_link_text('登录')
    login_btn.click()
    user_login = browser.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()
    username_input = browser.find_element_by_id('TANGRAM__PSP_10__userName')
    username_input.send_keys('13681878977')
    password_input = browser.find_element_by_id('TANGRAM__PSP_10__password')
    password_input.send_keys('zhang319!')
    submit_btn = browser.find_element_by_id('TANGRAM__PSP_10__submit')
    submit_btn.click()
    time.sleep(5)

    search_input = browser.find_element_by_id('kw')
    search_input.send_keys('名侦探柯南')
    time.sleep(10)

except Exception as e:
    print(e)
finally:
    browser.close()
```

Simple usage - 2

```python
# Simple usage 3: scraping product listings from jd.com
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time


def get_goods(browser):
    li_list = browser.find_elements_by_class_name('gl-item')
    for li in li_list:
        goods_price = li.find_element_by_css_selector('.p-price i').text
        # print(goods_price)

        goods_comment = li.find_element_by_css_selector('.p-commit strong a').text
        # print(goods_comment)

        goods_name = li.find_element_by_css_selector('.p-name-type-2 a').get_attribute('title')
        # print(goods_name)

        goods_url = li.find_element_by_css_selector('.p-name-type-2 a').get_attribute('href')

        goods_img = li.find_element_by_css_selector('.p-img a img').get_attribute('src')
        if not goods_img:
            goods_img = 'https:' + li.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')

        print(
            '''
            product name: %s
            product price: %s
            comment count: %s
            product detail url: %s
            product image url: %s
            ''' % (goods_name, goods_price, goods_comment, goods_url, goods_img)
        )

    next_page = browser.find_element_by_partial_link_text('下一页')
    time.sleep(2)
    next_page.click()
    get_goods(browser)


def spider():
    try:
        browser = webdriver.Chrome()
        browser.implicitly_wait(3)

        browser.get('https://www.jd.com')

        search_input = browser.find_element_by_id('key')
        search_input.send_keys('手机')
        search_input.send_keys(Keys.ENTER)
        time.sleep(5)

        # Pull the product information out of the page
        get_goods(browser)

    except Exception as e:
        print(e)
    finally:
        browser.close()


if __name__ == '__main__':
    spider()
```

Simple usage - 3 (scraping JD product listings)

```python
# Simple usage 4: simulate the browser's back and forward buttons
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.get('https://www.taobao.com')
browser.get('http://www.sina.com.cn/')

browser.back()
time.sleep(10)
browser.forward()
browser.close()
```

```python
# Running js code
from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('alert(1234)')
# While the alert opened by the js code is showing, .close() does not take effect
browser.close()
```

```python
# Tab management: tabs can be switched with js (window.open), with keyboard
# shortcuts such as ctrl+t, and so on; the js way is the most portable.
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')

print(browser.window_handles)  # all the open tabs
browser.switch_to_window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(5)
browser.switch_to_window(browser.window_handles[0])
browser.get('https://www.sina.com.cn')
browser.close()
```

```python
# Controlling mouse drags
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By                # lookup strategies, e.g. By.ID, By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys            # keyboard keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # wait for elements on the page to load
import time

driver = webdriver.Chrome()
driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
wait = WebDriverWait(driver, 3)
# driver.implicitly_wait(3)  # or use an implicit wait

try:
    driver.switch_to.frame('iframeResult')  # switch into iframeResult
    sourse = driver.find_element_by_id('draggable')
    target = driver.find_element_by_id('droppable')

    # Option 1: one action chain, executed in order as a unit
    # actions = ActionChains(driver)         # get an action-chain object
    # actions.drag_and_drop(sourse, target)  # queue the action on the chain
    # actions.perform()

    # Option 2: separate chains, each moving a different small offset
    ActionChains(driver).click_and_hold(sourse).perform()
    distance = target.location['x'] - sourse.location['x']

    track = 0
    while track < distance:
        ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
        track += 2

    ActionChains(driver).release().perform()

    time.sleep(10)

finally:
    driver.close()
```

Simple usage - 4 (simulating browser actions)

VI. A WeChat auto-reply bot

```python
# Pie chart of friends by gender
from wxpy import *
from pyecharts import Pie

bot = Bot(cache_path=True)  # confirm the login on your phone

# Get all friend objects as a list
friends = bot.friends()

attr = ['male friends', 'female friends', 'unknown']
value = [0, 0, 0]

for friend in friends:
    if friend.sex == 1:    # 1 means male
        value[0] += 1
    elif friend.sex == 2:  # 2 means female
        value[1] += 1
    else:
        value[2] += 1

pie = Pie('Gender ratio of my friends')
# chart name (str), attribute names (list), their values (list);
# is_label_show toggles the labels
pie.add("", attr, value, is_label_show=True)
pie.render('sex.html')

# Open the result in a browser
from selenium import webdriver
import time
browser = webdriver.Chrome()
browser.get('/Users/lich/PycharmProjects/w3spider_Proj/sex.html')
time.sleep(10)
browser.close()
```

Pie chart of friend statistics

```python
# Map of friends by province
from wxpy import *
from pyecharts import Map

bot = Bot(cache_path=True)  # confirm the login on your phone

# Get all friend objects as a list
friends = bot.friends()

area_dic = {}  # counts per province

for friend in friends:
    if friend.province not in area_dic:
        area_dic[friend.province] = 1
    else:
        area_dic[friend.province] += 1

attr = area_dic.keys()
value = area_dic.values()

map = Map('Where my friends live', width=1200, height=600)
map.add(
    "Friends by region",
    attr,
    value,
    maptype='china',
    is_visualmap=True,  # attach the VisualMap component
)

# is_visualmap -> bool: whether to use the visual-mapping component
map.render('area.html')
```

Map of friends by province

```python
# Auto-reply to every friend
from wxpy import *
bot = Bot(cache_path=True)


@bot.register()
def recv_send_msg(recv_msg):
    print('message received:', recv_msg.text)  # recv_msg.text is the message body
    return 'auto-reply: %s' % recv_msg.text


# Drop into a Python shell so the program keeps running
embed()
```

Auto-reply to all friends

```python
# Automatically reply to messages from my wife
from wxpy import *
bot = Bot(cache_path=True)

girl_friend = bot.search('老婆')[0]  # the contact saved as '老婆' (wife)
print(girl_friend)


# Handle messages from the chosen friend only: the sender,
# recv_msg.sender, must be girl_friend
@bot.register()
def recv_send_msg(recv_msg):
    # print('message received:', recv_msg.text)
    if recv_msg.sender == girl_friend:
        # Keep a copy in the file-transfer helper, to review later when I have time
        recv_msg.forward(bot.file_helper, prefix='message from my wife: ')
        ms = '老婆最美丽,我对老婆的爱如滔滔江水,连绵不绝'
        print('>>> replied to my wife:', ms)
        return ms  # send the reply to my wife


embed()
```

Auto-replying to my wife - 1

```python
# Auto-reply with the Tuling chatbot
import json
import requests
from wxpy import *

bot = Bot(cache_path=True)


# Call the Tuling chatbot API: send it the message, get back its reply
def auto_reply(text):
    url = "http://www.tuling123.com/openapi/api"
    api_key = "9df516a74fc443769b233b01e8536a42"
    payload = {
        "key": api_key,
        "info": text,
    }
    r = requests.post(url, data=json.dumps(payload))
    result = json.loads(r.content)
    # return "[from the chatbot] " + result["text"]
    return result["text"]


girl_friend = bot.search('老婆')[0]


@bot.register()
def forward_message(msg):
    if msg.sender == girl_friend:
        return auto_reply(msg.text)


embed()
```

Auto-replying to my wife - 2 (using the Tuling chatbot)
