python爬取百度标题_python编写的一个爬取补天厂商标题对应百度查找主域名的一个收集脚本...

该博客展示了如何使用Python进行网络爬虫实践:先访问360补天漏洞平台(loudong.360.cn)获取厂商(公司)名称,再利用这些名称进行百度搜索。代码中定义了两个类butian和baidu,分别用于爬取补天平台和百度搜索。butian类从补天平台获取公司名列表,baidu类则根据这些公司名在百度上进行搜索并保存结果到txt文件。
摘要由CSDN通过智能技术生成

[Python] 纯文本查看 复制代码
# coding=utf-8

#author:Liod

import requests,re,json

class butian(object):
    """Fetch the company names listed on one page of the Butian reward listing.

    The listing is requested with a form-encoded POST to
    ``http://loudong.360.cn/Reward/pub`` and the JSON response is expected to
    carry the entries under ``["data"]["list"]``, each with a
    ``"company_name"`` key.

    Attributes set by the methods:
        page: page number passed to the constructor.
        butian_url: endpoint that is POSTed to.
        data: form fields sent with the request (``s``, ``p``, ``token``).
        header: request headers, available after ``bananer()`` has run.
        res: raw response object, available after ``butianjson()`` has run.
        content: decoded JSON payload, available after ``butianjson()``.
    """

    def __init__(self, page):
        """Store the page number and prepare the POST form fields.

        Args:
            page: page number, sent as the ``p`` form field.
        """
        self.page = page
        self.butian_url = "http://loudong.360.cn/Reward/pub"
        self.data = {
            "s": 1,
            "p": self.page,
            "token": "",
        }

    def bananer(self):
        """Build the request headers, store them on ``self.header`` and return them.

        The ``Cookie`` value is left empty and has to be filled in by the user.
        No ``Content-Length`` entry is set: a fixed value is only correct for
        one particular body length, and requests derives the header from the
        encoded body itself.
        """
        self.header = {
            "Cookie": "",  # COOKIE
            "Host": "loudong.360.cn",
            "Referer": "http://loudong.360.cn/Service",
            "User-Agent": "Mozilla/5.0 (Linux; U; Android 5.1; zh-cn; m1 metal Build/LMY47I) AppleWebKit/537.36 (KHTML, like Gecko)Version/4.0 Chrome/37.0.0.0 MQQBrowser/7.6 Mobile Safari/537.36",
            "Origin": "http://loudong.360.cn",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "X-Requested-With": "XMLHttpRequest",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Accept-Language": "zh-CN,zh;q=0.8",
        }
        return self.header

    @staticmethod
    def _company_names(payload):
        """Return the ``company_name`` of every entry in ``payload["data"]["list"]``."""
        return [entry["company_name"] for entry in payload["data"]["list"]]

    def butianjson(self):
        """POST the listing request and return the company names on the page.

        Returns:
            A list with the ``company_name`` of every entry in the response,
            including the last one.

        Raises:
            KeyError / TypeError: if the decoded response does not have the
                ``data -> list -> company_name`` structure.
            ValueError: if the response body is not valid JSON.
            requests exceptions: on connection failure or timeout.
        """
        # timeout=10 matches the value used for the Baidu request in this
        # file, so a stalled connection cannot hang the script forever.
        self.res = requests.post(
            self.butian_url,
            headers=self.bananer(),
            data=self.data,
            timeout=10,
        )
        # Raw body is echoed for debugging; the call form works on Python 2 and 3.
        print(self.res.content)
        self.content = json.loads(self.res.content)
        return self._company_names(self.content)

class baidu(object):

def __init__(self):

self.url = "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=%E5%B9%BF%E5%B7%9E%E8%A7%86%E6%BA%90%E8%82%A1%E4%BB%BD%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8"

self.bananer = {

"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36",

"Cookie":"BAIDUID=A8AC42B1F46CDE7379A037C75CB62819:FG=1; BIDUPSID=A8AC42B1F46CDE7379A037C75CB62819; PSTM=1509928743; BDSFRCVID=W2AsJeCCxG3wqIbA3H_73bWlRYwArbZtRVBJ3J; H_BDCLCKID_SF=tRk8oDDafCvbfP0k54r-hICShUFX5-CsQbrCQhcH0hOWsIO6KfrDLjtnBNte5qbQLH5f54otytbCSlo_DUC0-nDSHHK8Jj8O3J; BD_UPN=123353; H_PS_645EC=87d0k6j1zJCm9Ri%2Fyz1u3cOEnpeK5T6s2yB7SB5VJZU3itkGx%2FAeu%2BGEwAs; BD_CK_SAM=1; PSINO=2; BDSVRTM=159; H_PS_PSSID=1426_12896_21106_17001_24879; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598"

}

self.proxies = {"http": "113.214.13.1:8000"}

def save_txt(self, url):
    """Append ``url`` followed by ``\\r\\n`` to ``test111saa.txt``.

    The file is opened relative to the current working directory and is
    created if it does not exist.

    Args:
        url: value to record; it is formatted with ``%s``.
    """
    # The context manager closes the handle even if write() raises.
    with open("test111saa.txt", "a") as out:
        out.write("%s\r\n" % url)

def connect_baidu(self, url):

self.url = "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=%s"%url

self.res = requests.get(self.url, headers = self.bananer, proxies = self.proxies, timeout=10)

self.result = re.findall(r'

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值