Introduction to Targeted Web Crawling in Python: Python (Part 3)

Installing lxml (and flask, etc.)

  • easy_install lxml (not recommended)
  • pip install lxml
  • Manual install: download the lxml package from http://www.lfd.uci.edu/~gohlke/pythonlibs/ , change the file extension to .zip and extract it, copy the lxml folder into C:\Python27\Lib, then import lxml in a .py file and check that no error is raised (see the quick check below)
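A quick way to run that import check is to print lxml's version right after importing it; a minimal sketch (LXML_VERSION is an attribute of lxml.etree, and the value you see depends on the build you installed):

#-*-coding:utf8-*-
from lxml import etree
# If the import succeeds, this prints the installed version tuple, e.g. (3, 4, 0, 0)
print etree.LXML_VERSION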

Introduction to Targeted Web Crawling in Python

Basic Regular Expressions

  • .* is greedy (swallows as much as it can in a single match)
  • .*? is non-greedy (takes small bites, returning as many matches as possible that still satisfy the pattern)
  • (.*?) is non-greedy and captures only the part you need
  • re.S lets . match newline characters as well (a compact example of all four follows below)
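As a compact illustration of the four points above, here is a small sketch on a made-up string (the string and the expected outputs in the comments are mine, not from the course):

#-*-coding:utf8-*-
import re
s = 'xxabcxx123xxdefxx'
print re.findall('xx.*xx', s)       # greedy: ['xxabcxx123xxdefxx'], one longest match
print re.findall('xx.*?xx', s)      # non-greedy: ['xxabcxx', 'xxdefxx']
print re.findall('xx(.*?)xx', s)    # capture only what sits in between: ['abc', 'def']
print re.findall('xx(.*?)xx', 'xxab\ncdxx', re.S)  # re.S lets . cross the newline: ['ab\ncd']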


#-*-coding:utf8-*-
# Import the re module
import re
# from re import findall,search,S
secret_code = 'hadkfalifexxIxxfasdjifja134xxlovexx23345sdfxxyouxx8dfse'
# Example of using .
# a = 'xy123'
# b = re.findall('x...',a)
# print b
# Example of using *
# a = 'xyxy123'
# b = re.findall('x*',a)
# print b
# Example of using ?
# a = 'xy123'
# b = re.findall('x?',a)
# print b
'''Everything above is only for reference; the one combination you need to master is (.*?)'''
# Example of using .*
# b = re.findall('xx.*xx',secret_code)
# print b
# Example of using .*?
# c = re.findall('xx.*?xx',secret_code)
# print c
# The difference between using parentheses and not using them
# d = re.findall('xx(.*?)xx',secret_code)
# print d
# for each in d:
#     print each
# s = '''sdfxxhello
# xxfsdfxxworldxxasdf'''
# d = re.findall('xx(.*?)xx',s,re.S)
# print d
# Comparing findall with search
# s2 = 'asdfxxIxx123xxlovexxdfd'
# f = re.search('xx(.*?)xx123xx(.*?)xx',s2).group(2)
# print f
# f2 = re.findall('xx(.*?)xx123xx(.*?)xx',s2)
# print f2[0][1]
# Example of using sub
# s = '123rrrrr123'
# output = re.sub('123(.*?)123','123%d123'%789,s)
# print output
# Demonstrating a different import style
# info = findall('xx(.*?)xx',secret_code,S)
# for each in info:
#     print each
# No need to use compile
# pattern = 'xx(.*?)xx'
# new_pattern = re.compile(pattern,re.S)
# output = re.findall(new_pattern,secret_code)
# print output
# Match digits
a = 'asdfasf1234567fasd555fas'
b = re.findall('(\d+)',a)
print b

Crawling Images

#-*-coding:utf8-*-
import re
import requests
# Read the saved page source from a file
f = open('source.txt','r')
html = f.read()
f.close()
# Match the image URLs
pic_url = re.findall('img src="(.*?)" class="lessonimg"',html,re.S)
i = 0
for each in pic_url:
    print 'now downloading:' + each
    pic = requests.get(each)
    # Write each image into the pic folder (the folder must already exist, see the note below)
    fp = open('pic\\' + str(i) + '.jpg','wb')
    fp.write(pic.content)
    fp.close()
    i += 1
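One practical detail: the script writes into a pic folder that it never creates. A small hedged addition, using the standard os module, that you could place before the download loop:

import os
# Create the output folder on first run so open('pic\\...') does not fail
if not os.path.exists('pic'):
    os.makedirs('pic')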

Hands-on: Building a Text Crawler

#-*-coding:utf8-*-
# Import the re module
import re
old_url = 'http://www.jikexueyuan.com/course/android/?pageNum=2'
total_page = 20
f = open('text.txt','r')
html = f.read()
f.close()
# Extract the title
# title = re.search('<title>(.*?)</title>',html,re.S).group(1)
# print title
# Extract the links
# links = re.findall('href="(.*?)"',html,re.S)
# for each in links:
#     print each
# Extract part of the text: match the large block first, then the smaller pieces inside it
# text_fied = re.findall('<ul>(.*?)</ul>',html,re.S)[0]
# the_text = re.findall('">(.*?)</a>',text_fied,re.S)
# for every_text in the_text:
#     print every_text
# Use sub to step through the pages
for i in range(2,total_page+1):
    new_link = re.sub('pageNum=\d+','pageNum=%d'%i,old_url)
    print new_link

Code adapted from the Jikexueyuan (极客学院) course.

Python Single-Threaded Crawler

Fetching page source with Requests, modifying the HTTP headers when needed

#-*-coding:utf8-*-
import re
import requests
# hea is a dict we build ourselves that holds the User-Agent
# (find it in Chrome: Inspect - Network - Headers, under user-agent)
hea = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
html = requests.get('http://jp.tingroom.com/yuedu/yd300p/',headers = hea)
chinese = re.findall('color: #039;">(.*?)</a>',html.text,re.S)
for each in chinese:
    print each

Submitting Data to a Web Page

Get vs. Post
  • GET retrieves data from the server
  • POST sends data to the server
  • GET works by encoding its parameters into the URL (see the GET sketch after this list)
  • POST puts the data in the request body of the submission rather than in the URL

  • Analysis tool: Chrome - Inspect - Network
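Since GET carries its parameters in the URL, requests can build the query string for you through the params argument. A minimal sketch; the URL and parameter names here are hypothetical, not from the course:

#-*-coding:utf8-*-
import requests
# requests turns params into a query string, e.g. ?pageNum=2&keyword=android
payload = {'pageNum':'2', 'keyword':'android'}
html_get = requests.get('http://example.com/course/android/', params=payload)
print html_get.url  # the final URL with the query string appended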

Form Submission with Requests
  • Core method: requests.post
  • Core steps: build the form - submit the form - read the response
#-*-coding:utf8-*-
import requests
import re
url = 'https://www.crowdfunder.com/browse/deals&template=false'
#Note: the number for 'page' has to be quoted as a string.
data = {
    'entities_only':'true',
    'page':'2'
}
html_post = requests.post(url,data=data)
title = re.findall('"card-title">(.*?)</div>',html_post.text,re.S)
for each in title:
    print each
Jikexueyuan course crawler: see the course source file jikexueyuan.py (G:\Python\4、Python 定向爬虫入门\2、Python 单线程爬虫\Single-thread-crawler\单线程爬虫-上线资源包\源码 jikexueyuan.py)
  • In Python 2, strings default to ASCII encoding; to declare a string explicitly as unicode, prefix it with 'u' or 'U'
  • print u'开始爬取内容。。。'

XPath and Multi-Threaded Crawlers

Getting the XPath of a Page Element

  • Manual analysis
  • Let Chrome generate it: F12, right-click the element, Copy XPath

Extracting Content with XPath

  • // locates the root node
  • / searches down into the next level
  • Extract text content: /text()
  • Extract an attribute value: /@xxxx (a short sketch follows this list)
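A minimal sketch of those four rules on a small made-up snippet; the HTML, the id value, and the URLs below are my own illustration, not from the course:

#-*-coding:utf8-*-
from lxml import etree
html = '''
<div id="content">
    <ul id="useful">
        <li><a href="http://example.com/1">first item</a></li>
        <li><a href="http://example.com/2">second item</a></li>
    </ul>
</div>
'''
selector = etree.HTML(html)
# // finds the ul anywhere in the tree, / walks down level by level,
# text() pulls the link text and @href pulls the attribute value
titles = selector.xpath('//ul[@id="useful"]/li/a/text()')
links = selector.xpath('//ul[@id="useful"]/li/a/@href')
for each in titles:
    print each
for each in links:
    print each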

Special XPath Tricks

  • Attributes that share the same starting characters: starts-with(@attribute_name, common_prefix)
  • Tags nested inside tags: string(.)
#-*-coding:utf8-*-
from lxml import etree
html1 = '''
<!DOCTYPE html>
<html>
<head lang="en">
    <meta charset="UTF-8">
    <title></title>
</head>
<body>
    <div id="test-1">需要的内容1</div>
    <div id="test-2">需要的内容2</div>
    <div id="testfault">需要的内容3</div>
</body>
</html>
'''
html2 = '''
<!DOCTYPE html>
<html>
<head lang="en">
    <meta charset="UTF-8">
    <title></title>
</head>
<body>
    <div id="test3">
        我左青龙,
        <span id="tiger">
            右白虎,
            <ul>上朱雀,
                <li>下玄武。</li>
            </ul>
            老牛在当中,
        </span>
        龙头在胸口。
    </div>
</body>
</html>
'''
selector = etree.HTML(html1)
# starts-with matches all three divs whose id begins with "test"
content = selector.xpath('//div[starts-with(@id,"test")]/text()')
for each in content:
    print each
selector = etree.HTML(html2)
data = selector.xpath('//div[@id="test3"]')[0]
# string(.) concatenates the text of the div and everything nested inside it
info = data.xpath('string(.)')
content_2 = info.replace('\n','').replace(' ','')
print content_2

Introduction to and Demonstration of Parallelization in Python

The map function single-handedly takes care of iterating over the sequence, passing the arguments, and collecting the results.
from multiprocessing.dummy import Pool
pool = Pool(4)
results = pool.map(crawl_function, url_list)

#-*-coding:utf8-*-
from multiprocessing.dummy import Pool as ThreadPool
import requests
import time
def getsource(url):
    html = requests.get(url)
    return html.text
urls = []
for i in range(1,21):
    newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
    urls.append(newpage)
# Fetch the 20 pages one after another and time it
time1 = time.time()
for i in urls:
    print i
    getsource(i)
time2 = time.time()
print u'Single-threaded time: ' + str(time2-time1)
# Fetch the same pages with a pool of 4 worker threads and time it
pool = ThreadPool(4)
time3 = time.time()
results = pool.map(getsource, urls)
pool.close()
pool.join()
time4 = time.time()
print u'Parallel time: ' + str(time4-time3)

Hands-on: Baidu Tieba Crawler

#-*-coding:utf8-*-
from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import json
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
'''Delete content.txt before re-running: the file is opened in append mode, so old content piles up.'''
def towrite(contentdict):
    f.writelines(u'Reply time: ' + str(contentdict['topic_reply_time']) + '\n')
    f.writelines(u'Reply content: ' + unicode(contentdict['topic_reply_content']) + '\n')
    f.writelines(u'Reply author: ' + contentdict['user_name'] + '\n\n')
def spider(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)
    content_field = selector.xpath('//div[@class="l_post l_post_bright "]')
    item = {}
    for each in content_field:
        # Each post carries its metadata (author, date, etc.) as JSON in its data-field attribute
        reply_info = json.loads(each.xpath('@data-field')[0].replace('&quot',''))
        author = reply_info['author']['user_name']
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/div[@class="d_post_content j_d_post_content "]/text()')[0]
        reply_time = reply_info['content']['date']
        print content
        print reply_time
        print author
        item['user_name'] = author
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_time
        towrite(item)
if __name__ == '__main__':
    pool = ThreadPool(4)
    f = open('content.txt','a')
    page = []
    for i in range(1,21):
        newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
        page.append(newpage)
    results = pool.map(spider, page)
    pool.close()
    pool.join()
    f.close()