面向对象编写爬虫
注意点
1 format 传参
2 生成 index 区分文件名
3 面向对象的思维,获取数据,保存数据,运行三部分分开
4 sys.argv 命令行传参
示例代码
import requests
import time
import sys
class Tieba(object):
    """Download the listing pages of one Baidu Tieba forum and save them as HTML.

    Fetching (``get_data``), persisting (``save_data``) and orchestration
    (``run``) are kept as separate methods.

    Args:
        name: Forum name, substituted into the ``kw`` query parameter.
        pages: Number of listing pages to download (default 10).
    """

    # Tieba's ``pn`` query parameter is a thread offset rather than a page
    # number; listing pages advance in steps of 50.
    # NOTE(review): step size taken from the site's observed behaviour - confirm.
    PAGE_SIZE = 50

    # Seconds to wait for the server before giving up, so that a stalled
    # connection cannot hang the crawl forever.
    TIMEOUT = 10

    def __init__(self, name, pages=10):
        self.name = name
        self.base_url = "http://tieba.baidu.com/f?ie=utf-8&kw={}&ie=utf-8&pn=".format(self.name)
        self.headers = {
            "User-Agent": 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
        }
        # One URL per listing page: pn = 0, 50, 100, ...
        self.url_list = [
            self.base_url + str(page * self.PAGE_SIZE) for page in range(pages)
        ]

    def get_data(self, url):
        """Fetch ``url`` with the browser-like headers and return the raw body bytes."""
        response = requests.get(url, headers=self.headers, timeout=self.TIMEOUT)
        return response.content

    def save_data(self, content, index):
        """Write ``content`` (bytes) to ``<name><index>.html`` in the working directory."""
        filename = self.name + str(index) + '.html'
        # Binary mode: ``content`` is the undecoded response body.
        with open(filename, 'wb') as f:
            f.write(content)

    def run(self):
        """Download every URL in ``url_list`` and save each page under a 1-based index."""
        # enumerate gives the index directly; looking it up with list.index()
        # is O(n) per page and returns the wrong slot for duplicate URLs.
        for index, url in enumerate(self.url_list, start=1):
            data = self.get_data(url)
            self.save_data(data, index)
if __name__ == '__main__':
    # The forum name is the single required command-line argument; exit with
    # a usage message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: python {} <tieba_name>".format(sys.argv[0]))
    name = sys.argv[1]
    tieba = Tieba(name)
    tieba.run()
关于post的编解码问题
注意点
1 传递的data 参数是个字典
2 对content 首先进行解码 decode
3 对str 进行json.loads
4 从最后得到的字典中去取值
示例代码
import requests
import sys
import json
class Ciba(object):
    """Translate one word by POSTing it to the iciba translation endpoint.

    Args:
        word: The text to translate; sent as the ``w`` form field.
    """

    # Seconds to wait for the server before giving up.
    TIMEOUT = 10

    def __init__(self, word):
        self.word = word
        self.url = 'http://fy.iciba.com/ajax.php?a=fy'
        self.headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }

    def get_data(self):
        """Send the POST request and return the raw response body as bytes."""
        # ``data`` must be a dict; requests form-encodes it.
        post_data = {
            'f':'auto',
            't':'auto',
            'w':self.word,
        }
        response = requests.post(
            self.url, data=post_data, headers=self.headers, timeout=self.TIMEOUT
        )
        return response.content

    def run(self):
        """Fetch the translation once and print each decoding stage and the result."""
        # Fetch exactly once: every get_data() call is a separate network
        # request, so the body is kept and reused for all the prints below.
        raw = self.get_data()
        print(raw)
        print(type(raw))
        # bytes -> str -> dict
        res = raw.decode()
        print(type(res))
        res2 = json.loads(res)
        # The 'content' key may be absent or not a mapping (e.g. on an error
        # reply); print None in that case rather than raising AttributeError.
        content = res2.get('content')
        print(content.get('out') if isinstance(content, dict) else None)
if __name__ == "__main__":
    # The word to translate is the single required command-line argument;
    # exit with a usage message instead of an IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: python {} <word>".format(sys.argv[0]))
    word = sys.argv[1]
    ciba = Ciba(word)
    ciba.run()
使用代理
注意点
http 和 https 不能同时开启
付费代理的使用格式
示例代码
import requests
# Demo: send a GET request with a ``proxies`` mapping and print the status code.
url = 'https://www.taobao.com'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
# Keys are URL schemes; values are the proxy URL to use for that scheme.
proxies = {
    'http':'http://183.163.40.223:31773',
    # Format for a paid proxy that requires credentials:
    # 'http':'http://user:pwd@183.163.40.223:31773',
    # NOTE(review): ``url`` above is https while this entry is empty, so the
    # request presumably does not go through the proxy at all - confirm, and
    # point 'https' at a proxy if proxying this request is the intent.
    'https':''
}
# A timeout keeps the script from hanging indefinitely when the proxy or the
# target server does not answer.
response = requests.get(url, proxies=proxies, headers=headers, timeout=10)
print(response.status_code)
有关cookie 和session
注意点
1 通过 headers 传递 cookie 时,键名是 cookie 而不是 cookies(cookies 是 requests.get 的关键字参数名,见示例代码2)
2 url 需要直接写到 访问私人信息的页面
3 cookie 的两种设置方式
4 正则匹配 re.findall 的用法 结果是返回一个列表
示例代码
import requests
import re
# Demo: access a login-only page by sending the browser's cookie string
# verbatim in the request headers (header key is 'cookie').
# NOTE(review): this is a captured session cookie hard-coded in source; it
# should be treated as a credential and not committed - confirm it is expired.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'cookie':'anonymid=jawg9kd7k1zz09; _r01_=1; _ga=GA1.2.48173182.1525587736; __utma=151146938.48173182.1525587736.1526023590.'
             '1526023590.1; __utmz=151146938.1526023590.1.1.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/SysHome.do;'
             ' depovince=GW; jebecookies=dd54997e-141e-4db1-b3ac-5f7919c121a9|||||; JSESSIONID=abcb0QeuehygE7d76WGtw; ick_login='
             '48f4d146-3fe3-4806-a2a9-0762e7c742f9; _de=82D006EDB0340D0076B255B13038CCD8; p=0030030751d26caff8c1ffe172f1122d8; '
             'first_login_flag=1; ln_uact=15626046299; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=cb4639287e91a4a'
             '964a224ea50cd929f8; societyguester=cb4639287e91a4a964a224ea50cd929f8; id=965882188; xnsid=1f3be1d7; ver=7.0; '
             'loginfrom=null; wp_fold=0'
}
# Request the private profile page directly rather than the login page.
url = 'https://www.renren.com/965882188'
# A timeout keeps the script from hanging if the server never answers.
response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)
print()
data = response.content.decode()
# re.findall returns a list of every match (empty when the text is absent).
print(re.findall("新用户",data))
# The final URL after any redirects.
print(response.url)
示例代码2
# coding:utf-8
import requests
import re
# Demo: access a login-only page by parsing the browser's cookie string into
# a dict and passing it through the ``cookies=`` keyword argument.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
# NOTE(review): this is a captured session cookie hard-coded in source; it
# should be treated as a credential and not committed - confirm it is expired.
temp = 'anonymid=jawg9kd7k1zz09; _r01_=1; _ga=GA1.2.48173182.1525587736; __utma=151146938.48173182.1525587736.1526023590.1526023590.1; __utmz=151146938.1526023590.1.1.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/SysHome.do; depovince=GW; jebecookies=dd54997e-141e-4db1-b3ac-5f7919c121a9|||||; JSESSIONID=abcb0QeuehygE7d76WGtw; ick_login=48f4d146-3fe3-4806-a2a9-0762e7c742f9; _de=82D006EDB0340D0076B255B13038CCD8; p=0030030751d26caff8c1ffe172f1122d8; first_login_flag=1; ln_uact=15626046299; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=cb4639287e91a4a964a224ea50cd929f8; societyguester=cb4639287e91a4a964a224ea50cd929f8; id=965882188; xnsid=1f3be1d7; ver=7.0; loginfrom=null; wp_fold=0'
temp_list = temp.split('; ')
cookies = {}
for temp_ in temp_list:
    # Split on the first '=' only, because cookie values may themselves
    # contain '='. partition() does this in one pass and, unlike indexing
    # the result of split(), cannot raise on a segment without '='.
    key, sep, value = temp_.partition("=")
    if not sep:
        # Not a key=value pair (e.g. an empty trailing segment); skip it.
        continue
    cookies[key] = value
# Request the private profile page directly rather than the login page.
url = 'https://www.renren.com/965882188'
# A timeout keeps the script from hanging if the server never answers.
response = requests.get(url, cookies=cookies, headers=headers, timeout=10)
print(response.status_code)
data = response.content.decode()
# re.findall returns a list of every match (empty when the text is absent).
print(re.findall("新用户",data))
# The final URL after any redirects.
print(response.url)