最近学习了一下 Python,写了一小段代码练习一下。
一、获取网页数据(html)
url:网址;headers:请求头部信息(见下方列表)。
找了一个 常用的 User-Agent:
# Pool of common browser User-Agent strings; get_html() picks one at random
# per request so the crawler looks less like a scripted client.
headers = [
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
]
一般我们通过下面的函数来获取 html 页面:
'''
获取html页面
'''
def get_html(url,headers):
    """Download and return the raw HTML of *url*.

    A User-Agent is chosen at random from *headers* (a list of UA
    strings), and a few Sec-Fetch-* headers are added so the request
    resembles a normal browser navigation.
    """
    request = urllib2.Request(url)
    browser_headers = {
        'User-Agent': random.choice(headers),
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
    }
    for name, value in browser_headers.items():
        request.add_header(name, value)
    return urllib2.urlopen(request).read()
二、将获取的html转化为可用的数据 (使用的是 xpath)
可以安装 Google 浏览器的 XPath 插件(如 XPath Helper),方便你编写和调试 xpath 表达式。
soup.xpath('//div[@class="witkey-item-top "]') 里面是你要截取的数据 这个根据你的需要改变哦!
'''
获取html页面内数据
'''
def get_page_data(html):
    """Parse *html* with lxml and append one title per matched <div> to the CSV.

    The class name "class" in the XPath expressions is a placeholder --
    replace it with the real class attribute of the target page.
    """
    soup = etree.HTML(html)
    div_list = soup.xpath('//div[@class="class"]')
    # Raw string: a plain '...' Windows path relies on backslash escapes
    # behaving benignly, which breaks on Python 3 (\U is a unicode escape).
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'a') as f:
        for div in div_list:
            titles = div.xpath('.//div[@class="class"]/text()')
            if not titles:
                # Guard against divs missing the expected node instead of
                # crashing with an IndexError on [0].
                continue
            f.write('{}\n'.format(titles[0]))
'''
三、创建csv文件 一定要先创建带表头的csv文件,再去追加写入数据
'''
生成csv
'''
def creat_csv():
    """Create the CSV file and write its header row.

    Run this once before get_page_data() starts appending data rows.
    NOTE: opening in 'wb' for csv.writer is correct on Python 2 only.
    """
    csv_headers = ['标题']
    # Raw string avoids fragile backslash escaping in the Windows path.
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'wb') as f:
        csv.writer(f).writerow(csv_headers)
    # The redundant f.close() was removed: the with-statement already
    # closes the file on exit.
三、记得引入用的模块
import urllib2
import random
import csv
import sys
# Python 2-only hack: reload(sys) re-exposes sys.setdefaultencoding (hidden
# by site.py) so Chinese text mixes with byte strings without explicit
# decoding. Discouraged -- prefer explicit .encode('utf-8')/.decode('utf-8').
reload(sys)
sys.setdefaultencoding( "utf-8" )
from lxml import etree
四、还是把完整代码写一下吧 里面的参数需要替换成你自己的哦
# -*- coding:utf-8 -*-
import urllib2
import random
import csv
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
from lxml import etree
# Pool of common browser User-Agent strings; get_html() picks one at random
# per request so the crawler looks less like a scripted client.
headers = [
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
]
'''
获取html页面
'''
def get_html(url,headers):
    """Download and return the raw HTML of *url*.

    A User-Agent is chosen at random from *headers* (a list of UA
    strings), and a few Sec-Fetch-* headers are added so the request
    resembles a normal browser navigation.
    """
    request = urllib2.Request(url)
    browser_headers = {
        'User-Agent': random.choice(headers),
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
    }
    for name, value in browser_headers.items():
        request.add_header(name, value)
    return urllib2.urlopen(request).read()
'''
生成csv
'''
def creat_csv():
    """Create the CSV file and write its header row ('名称', '网址').

    Run this once before get_page_data() starts appending data rows.
    NOTE: opening in 'wb' for csv.writer is correct on Python 2 only.
    """
    csv_headers = ['名称', '网址']
    # Raw string avoids fragile backslash escaping in the Windows path.
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'wb') as f:
        csv.writer(f).writerow(csv_headers)
    # The redundant f.close() was removed: the with-statement already
    # closes the file on exit.
'''
获取html页面内数据
'''
def get_page_data(html):
    """Parse *html* with lxml and append "title,link" rows to the CSV.

    Replace "你的类属性" in the XPath expressions with the real class
    attribute of the target page.
    """
    soup = etree.HTML(html)
    div_list = soup.xpath('//div[@class="你的类属性"]')
    # Raw string: a plain '...' Windows path relies on backslash escapes
    # behaving benignly, which breaks on Python 3 (\U is a unicode escape).
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'a') as f:
        for div in div_list:
            titles = div.xpath('.//div[@class="你的类属性"]/a/text()')
            # Bug fix: the attribute is @href, not @herf -- the typo made
            # every link query return an empty list and raise IndexError.
            links = div.xpath('.//div[@class="你的类属性"]/a/@href')
            if not titles or not links:
                continue  # skip divs missing the expected <a> node
            f.write('{},{}\n'.format(titles[0], links[0]))
'''
主函数
'''
def main():
    """Prompt for a page count and keyword, then crawl and save each page."""
    # int(raw_input(...)) instead of input(): on Python 2, input() eval()s
    # whatever the user types, which is unsafe on untrusted input.
    num = int(raw_input('请输入你要爬取的页数'))
    keyword = raw_input('请输入你要爬取关键词')
    keyword = urllib2.quote(keyword)
    for i in range(num):
        # Page-offset formula of the target site; clamp negative offsets
        # (i == 0 would yield -5) to the first page.
        page = (i - 1) * 5 + i * 65
        if page < 0:
            page = 0
        url = '你的地址?page={}&key={}'.format(page, keyword)
        html = get_html(url, headers)
        get_page_data(html)

if __name__ == '__main__':
    creat_csv()  # create the CSV (with header row) before appending data
    main()
五、如果有好的方法、逻辑 ,欢迎留言 沟通 指教