Some examples of Perl regular expressions (PRX functions) in SAS

详见:http://www2.sas.com/proceedings/sugi29/265-29.pdf

 

/* Program 1: use a Perl regular expression to locate lines that contain */
/*            the exact text "cat".                                      */
/* Primary functions: PRXPARSE, PRXMATCH                                 */
DATA _NULL_;
   TITLE "Perl Regular Expression Tutorial – Program 1";

   /* The compiled pattern id must survive from one iteration to the next. */
   RETAIN PATTERN_NUM;
   IF _N_ = 1 THEN DO;
      /* PRXPARSE compiles the regular expression and returns its id. */
      PATTERN_NUM = PRXPARSE("/cat/");
   END;

   INPUT STRING $30.;

   /* PRXMATCH searches STRING with the compiled pattern and returns the */
   /* position where the match starts (0 when there is no match).        */
   POSITION = PRXMATCH(PATTERN_NUM, STRING);

   /* Write the three values to the output listing. */
   FILE PRINT;
   PUT PATTERN_NUM= STRING= POSITION=;
DATALINES;
There is a cat in this line.
Does not match CAT
cat in the beginning
At the end, a cat
cat
;

/* Program 2: use a regular expression to search for phone numbers in a string. */
/* Primary functions: PRXPARSE, PRXMATCH                                        */
DATA PHONE;
   RETAIN PATTERN;

   /*
      The pattern matches a phone number written as
      (nnn)nnn-nnnn or (nnn) nnn-nnnn:

         \(       a left parenthesis
         \d\d\d   any three digits
         \)       a right parenthesis
         (blank)? zero or one blank
         \d\d\d   any three digits
         -        a dash
         \d{4}    any four digits
   */
   IF _N_ = 1 THEN DO;
      PATTERN = PRXPARSE("/\(\d\d\d\) ?\d\d\d-\d{4}/");
   END;

   INPUT STRING $CHAR40.;

   /* Subsetting IF: only lines with at least one match reach the output. */
   IF PRXMATCH(PATTERN, STRING) > 0;
DATALINES;
One number (123)333-4444
Two here:(800)234-2222 and (908) 444-2344
None here
;
PROC PRINT DATA=PHONE NOOBS;
   TITLE "Listing of Data Set Phone";
RUN;

/* Program 3: Modifying Program 2 to search for toll-free phone numbers */
/* Primary functions: PRXPARSE, PRXMATCH                                */
/* Other function: MISSING                                              */
DATA TOLL_FREE;
   /* Compile the pattern once, on the first iteration only.            */
   /* The DO needs its own terminating semicolon; without it the next   */
   /* assignment is absorbed into an iterative "DO RE = ..." statement. */
   IF _N_ = 1 THEN DO;
      /*
         The regular expression looks for phone numbers of the form
         (nnn)nnn-nnnn or (nnn) nnn-nnnn. In addition, the first digit
         of the area code must be an 8 and the next two digits must be
         00, 77, or 87. The trailing \b requires a word boundary after
         the last digit.
      */
      RE = PRXPARSE("/\(8(00|77|87)\) ?\d\d\d-\d{4}\b/");

      /* Stop the step if the pattern could not be compiled. */
      IF MISSING(RE) THEN DO;
         PUT "ERROR IN COMPILING REGULAR EXPRESSION";
         STOP;
      END;
   END;
   RETAIN RE;

   INPUT STRING $CHAR80.;

   /* Start position of the first toll-free number, 0 when none is found. */
   POSITION = PRXMATCH(RE,STRING);
   IF POSITION GT 0 THEN OUTPUT;
DATALINES;
One number on this line (877)234-8765
No numbers here
One toll free, one not:(908)782-6354 and (800)876-3333 xxx
Two toll free:(800)282-3454 and (887) 858-1234
No toll free here (609)848-9999 and (908) 345-2222
;
PROC PRINT DATA=TOLL_FREE NOOBS;
   TITLE "Listing of Data Set TOLL_FREE";
RUN;

/* Program 4: call PRXMATCH without PRXPARSE by passing the regular */
/*            expression directly as the first argument.            */
/* Primary function: PRXMATCH                                       */
DATA MATCH_IT;
   /* Read the first 20 columns of each data line. */
   INPUT STRING $20.;

   /* Position of the first run of three consecutive digits (0 if none). */
   POSITION = PRXMATCH("/\d\d\d/", STRING);
DATALINES;
LINE 345 IS HERE
NONE HERE
ABC1234567
;
PROC PRINT DATA=MATCH_IT NOOBS;
   TITLE "Listing of Data Set MATCH_IT";
RUN;

/* Program 5: locate all 5- or 9-digit zip codes in a list of addresses. */
/* Primary functions: PRXPARSE, CALL PRXSUBSTR                           */
/* Other function: SUBSTRN                                               */
DATA ZIPCODE;
   LENGTH ZIP_CODE $ 10;
   KEEP ZIP_CODE;
   RETAIN ZIP_RE;

   /*
      Match a blank followed by 5 digits, followed by either nothing
      or a dash and 4 digits:

         \d{5}   5 digits
         -       a dash
         \d{4}   4 digits
         ?       zero or one of the preceding subexpression
   */
   IF _N_ = 1 THEN ZIP_RE = PRXPARSE("/ \d{5}(-\d{4})?/");

   INPUT STRING $80.;

   /* ZIP_POS and ZIP_LEN receive the start and length of the match. */
   CALL PRXSUBSTR(ZIP_RE, STRING, ZIP_POS, ZIP_LEN);

   /* Only address lines that contain a zip code are written out. */
   IF ZIP_POS > 0;

   /* Skip the leading blank that is part of the match. */
   ZIP_CODE = SUBSTRN(STRING, ZIP_POS + 1, ZIP_LEN - 1);
DATALINES;
John Smith
12 Broad Street
Flemington, NJ 08822
Philip Judson
Apt #1, Building 7
777 Route 730
Kerrville, TX 78028
Dr. Roger Alan
44 Commonwealth Ave.
Boston, MA 02116-7364
;
PROC PRINT DATA=ZIPCODE NOOBS;
   TITLE "Listing of Data Set ZIPCODE";
RUN;

/* Program 6: Extracting a phone number from a text string */
/* Primary functions: PRXPARSE, CALL PRXSUBSTR             */
/* Other functions: SUBSTR, COMPRESS, and MISSING          */
DATA EXTRACT;
   /* Compile the phone-number pattern once and stop if it fails. */
   IF _N_ = 1 THEN DO;
      PATTERN = PRXPARSE("/\(\d\d\d\) ?\d\d\d-\d{4}/");
      IF MISSING(PATTERN) THEN DO;
         PUT "ERROR IN COMPILING REGULAR EXPRESSION";
         STOP;
      END;
   END;
   RETAIN PATTERN;
   LENGTH NUMBER $ 15;

   INPUT STRING $CHAR80.;

   /* START and LENGTH receive the position and length of the first match. */
   CALL PRXSUBSTR(PATTERN,STRING,START,LENGTH);
   IF START GT 0 THEN DO;
      /* Copy the matched text out of STRING ...                        */
      NUMBER = SUBSTR(STRING,START,LENGTH);
      /* ... and remove the optional blank after the area code so both  */
      /* accepted layouts are stored in the same form.                  */
      NUMBER = COMPRESS(NUMBER," ");
      OUTPUT;
   END;
   KEEP NUMBER;
DATALINES;
THIS LINE DOES NOT HAVE ANY PHONE NUMBERS ON IT
THIS LINE DOES: (123)345-4567 LA DI LA DI LA
ALSO VALID (123) 999-9999
TWO NUMBERS HERE (333)444-5555 AND (800)123-4567
;
PROC PRINT DATA=EXTRACT NOOBS;
   TITLE "Extracted Phone Numbers";
RUN;

/* Program 7: Using CALL PRXPOSN to extract the area code and exchange */
/*            from a phone number                                      */
/* Primary functions: PRXPARSE, PRXMATCH, CALL PRXPOSN                 */
/* Other function: SUBSTR                                              */
DATA PIECES;
   /* Compile the pattern on the first iteration only, as the other */
   /* programs in this file do.                                     */
   IF _N_ = 1 THEN RE = PRXPARSE("/\((\d\d\d)\) ?(\d\d\d)-\d{4}/");
   /*
      \(        matches an open parenthesis
      (\d\d\d)  matches three digits, capture buffer 1 (area code)
      \)        matches a closed parenthesis
      b?        matches zero or one blank (b = blank)
      (\d\d\d)  matches three digits, capture buffer 2 (exchange)
      -         matches a dash
      \d{4}     matches four digits
   */
   RETAIN RE;

   INPUT NUMBER $CHAR80.;
   MATCH = PRXMATCH(RE,NUMBER);
   IF MATCH GT 0 THEN DO;
      /* Capture buffer 1: only the start is requested, since the area */
      /* code is always three characters long.                         */
      CALL PRXPOSN(RE,1,AREA_START);
      /* Capture buffer 2: start and length of the exchange. */
      CALL PRXPOSN(RE,2,EX_START,EX_LENGTH);
      AREA_CODE = SUBSTR(NUMBER,AREA_START,3);
      EXCHANGE = SUBSTR(NUMBER,EX_START,EX_LENGTH);
   END;
   DROP RE;
DATALINES;
THIS LINE DOES NOT HAVE ANY PHONE NUMBERS ON IT
THIS LINE DOES: (123)345-4567 LA DI LA DI LA
ALSO VALID (609) 999-9999
TWO NUMBERS HERE (333)444-5555 AND (800)123-4567
;
PROC PRINT DATA=PIECES NOOBS HEADING=H;
   TITLE "Listing of Data Set PIECES";
RUN;

/* Program 8: use regular expressions to read very unstructured data.  */
/* Primary functions: PRXPARSE, PRXMATCH, CALL PRXPOSN                 */
/* Other functions: SUBSTR, INPUT                                      */
/* Every data line is read; for any line that contains two or more     */
/* numbers, the first number is assigned to X and the second to Y.     */
DATA READ_NUM;
   KEEP STRING X Y;
   RETAIN NUM_RE;

   /*
      Pattern for the first and second numbers on a line:

         (\d+)  one or more digits, capture buffer 1
         b+     one or more blanks (b = blank)
         \D*    zero or more non-digits
         (\d+)  one or more digits, capture buffer 2
   */
   IF _N_ = 1 THEN NUM_RE = PRXPARSE("/(\d+) +\D*(\d+)/");

   INPUT STRING $CHAR40.;

   /* Lines without two numbers are not written to the data set. */
   HIT = PRXMATCH(NUM_RE, STRING);
   IF HIT > 0;

   /* First captured number -> X */
   CALL PRXPOSN(NUM_RE, 1, POS_X, LEN_X);
   IF POS_X > 0 THEN DO;
      X = INPUT(SUBSTR(STRING, POS_X, LEN_X), 9.);
   END;

   /* Second captured number -> Y */
   CALL PRXPOSN(NUM_RE, 2, POS_Y, LEN_Y);
   IF POS_Y > 0 THEN DO;
      Y = INPUT(SUBSTR(STRING, POS_Y, LEN_Y), 9.);
   END;
DATALINES;
XXXXXXXXXXXXXXXXXX 9 XXXXXXX 123
This line has a 6 and a 123 in it
456 789
None on this line
Only one here: 77
;
PROC PRINT DATA=READ_NUM NOOBS;
   TITLE "Listing of Data Set READ_NUM";
RUN;

/* Program 9: find digits in random positions in an input string */
/*            using CALL PRXNEXT.                                */
/* Primary functions: PRXPARSE, CALL PRXNEXT                     */
DATA FIND_NUM;
   KEEP X1-X5 STRING;
   RETAIN DIGITS_RE;

   /* Look for one or more digits in a row. */
   IF _N_ = 1 THEN DIGITS_RE = PRXPARSE("/\d+/");

   INPUT STRING $40.;
   ARRAY X[5];

   /* Search window: from the first character to the last non-blank one. */
   SCAN_FROM = 1;
   SCAN_TO   = LENGTH(STRING);

   /* Prime the loop with the first match; each later call continues   */
   /* from SCAN_FROM, which CALL PRXNEXT moves past the previous match. */
   CALL PRXNEXT(DIGITS_RE, SCAN_FROM, SCAN_TO, STRING, HIT_POS, HIT_LEN);

   /* Store up to five numbers; stop early when no further match is found. */
   DO K = 1 TO 5 WHILE (HIT_POS > 0);
      X[K] = INPUT(SUBSTR(STRING, HIT_POS, HIT_LEN), 9.);
      CALL PRXNEXT(DIGITS_RE, SCAN_FROM, SCAN_TO, STRING, HIT_POS, HIT_LEN);
   END;
DATALINES;
THIS 45 LINE 98 HAS 3 NUMBERS
NONE HERE
12 34 78 90
;
PROC PRINT DATA=FIND_NUM NOOBS;
   TITLE "Listing of Data Set FIND_NUM";
RUN;

/* Program 10: demonstrate the PRXPAREN function.        */
/* Primary functions: PRXPARSE, PRXMATCH, PRXPAREN       */
DATA PAREN;
   RETAIN PATTERN;

   /* Three alternatives, each in its own capture buffer: a one-, two-, */
   /* or three-digit number followed by a blank.                        */
   IF _N_ = 1 THEN DO;
      PATTERN = PRXPARSE("/(\d )|(\d\d )|(\d\d\d )/");
   END;

   INPUT STRING $CHAR30.;
   POSITION = PRXMATCH(PATTERN, STRING);

   /* After a successful match, ask which capture buffer was the last */
   /* one to take part in it.                                         */
   IF POSITION > 0 THEN DO;
      WHICH_PAREN = PRXPAREN(PATTERN);
   END;
DATALINES;
one single digit 8 here
two 888 77
12345 1234 123 12 1
;
PROC PRINT DATA=PAREN NOOBS;
   TITLE "Listing of Data Set PAREN";
RUN;

/* Program 11: demonstrate CALL PRXCHANGE.             */
/* Primary functions: PRXPARSE, CALL PRXCHANGE         */
DATA CAT_AND_MOUSE;
   INPUT TEXT $CHAR40.;
   LENGTH NEW_TEXT $ 80;

   /* Substitution pattern: replace "Cat" or "cat" with Mouse. */
   RETAIN MATCH;
   IF _N_ = 1 THEN DO;
      MATCH = PRXPARSE("s/[Cc]at/Mouse/");
   END;

   /* -1 asks for every occurrence to be replaced. The result goes to  */
   /* NEW_TEXT; R_LENGTH, TRUNC and N_OF_CHANGES receive the result    */
   /* length, a truncation flag, and the number of replacements made.  */
   CALL PRXCHANGE(MATCH, -1, TEXT, NEW_TEXT, R_LENGTH, TRUNC, N_OF_CHANGES);

   IF TRUNC THEN DO;
      PUT "Note: NEW_TEXT was truncated";
   END;
DATALINES;
The Cat in the hat
There are two cat cats in this line
;
PROC PRINT DATA=CAT_AND_MOUSE NOOBS;
   TITLE "Listing of CAT_AND_MOUSE";
RUN;

/* Program 12: Demonstrating the use of capture buffers with CALL PRXCHANGE */
/* Primary functions: PRXPARSE, CALL PRXCHANGE                              */
DATA CAPTURE;
   /*
      Substitution pattern that swaps the first two words of a string:

         (\w+ +)  one or more word characters followed by one or more
                  blanks, capture buffer 1
         (\w+)    one or more word characters, capture buffer 2
         $2 $1    replacement: buffer 2, a blank, then buffer 1

      The substitution operator is written as a lowercase "s", the same
      form used by the other substitution example in this file.
   */
   IF _N_ = 1 THEN RETURN = PRXPARSE("s/(\w+ +)(\w+)/$2 $1/");
   RETAIN RETURN;

   INPUT STRING $20.;

   /* With only three arguments the replacement is made in place in STRING; */
   /* -1 asks for every occurrence to be replaced.                          */
   CALL PRXCHANGE(RETURN,-1,STRING);
DATALINES;
Ron Cody
Russell Lynn
;
PROC PRINT DATA=CAPTURE NOOBS;
   TITLE "Listing of Data Set CAPTURE";
RUN;


 

### 回答1: Spark SQL是Apache Spark的一个模块,提供了基于SQL的数据处理功能。下面是一些Spark SQL的示例: 1. 创建DataFrame:可以通过从文件或其他数据源读取数据创建DataFrame,也可以从已有的RDD或DataFrame创建。 2. 注册表:将DataFrame注册为临时表,以便使用SQL语句进行查询。 3. 执行SQL查询:使用Spark SQL查询语句从表中查询数据。 4. 将结果保存到文件:使用Spark SQL将查询结果保存到文件中,可以是CSV、Parquet等格式。 5. 用户自定义函数:使用UDF(用户自定义函数)在SQL查询中进行自定义数据处理。 这些是Spark SQL的一些示例,您可以根据您的需求和数据处理要求进行相应的修改。 ### 回答2: Spark SQL是Spark中的一个模块,用于处理结构化数据,并且提供了类似于传统SQL的查询和分析功能。下面是一些关于Spark SQL的例子: 1. 查询数据:使用Spark SQL,可以通过SQL语句来查询结构化数据。例如,可以使用SELECT语句来选择指定的列,使用FROM来指定数据源,使用WHERE来添加过滤条件等。 2. 聚合操作:Spark SQL支持各种聚合操作,如求和、平均值、最大值、最小值等。可以使用GROUP BY子句将数据分组,并使用聚合函数对每个组进行计算。 3. 连接多个数据源:通过Spark SQL可以将不同的数据源连接在一起进行查询和分析。例如,可以将关系型数据库中的数据和Hadoop中的数据进行连接,并通过SQL语句进行数据查询。 4. 处理半结构化数据:Spark SQL还可以处理半结构化数据,如JSON和XML。可以通过将半结构化数据转换为表格形式,然后使用SQL语句进行查询和分析。 5. 与机器学习集成:Spark SQL可以与Spark的机器学习库集成,从而可以使用SQL语句进行数据预处理和特征工程等操作。例如,可以使用Spark SQL对数据进行清洗和转换,然后将其用于机器学习模型训练。 总之,Spark SQL提供了强大的查询和分析功能,可以方便地处理和查询结构化、半结构化的数据,并且可以与其他Spark模块和库集成,实现更丰富的数据分析和机器学习任务。 ### 回答3: Spark SQL是Apache Spark生态系统中的一个组件,它提供了一个允许开发人员使用SQL查询和操作结构化和半结构化数据的接口。以下是一些关于Spark SQL的例子: 1. 数据加载和读取:Spark SQL可以从各种数据源加载和读取数据。例如,可以使用Spark SQL从关系型数据库(如MySQL或PostgreSQL)中读取表数据。 2. 数据查询和过滤:使用Spark SQL,可以通过执行SQL查询和应用过滤器对数据进行查询和过滤。例如,可以编写一个SQL查询来获取所有销售额大于1000的订单。 3. 数据聚合和分组:Spark SQL支持基于列的聚合操作,如求和、平均值、最大值和最小值。可以使用Spark SQL将数据按照指定的列进行分组,然后对每个组应用聚合操作。 4. 数据转换和运算:使用Spark SQL可以对数据进行转换和运算。例如,可以使用Spark SQL添加新的列、删除旧的列、修改列数据类型等。 5. 数据写入和保存:Spark SQL可以将处理后的数据写入到不同的数据源中。例如,可以使用Spark SQL将查询结果保存到关系型数据库或者分布式文件系统中。 6. 数据库连接和操作:Spark SQL可以通过支持JDBC或ODBC连接到其他数据库,并执行数据操作。例如,可以使用Spark SQL连接到一个MySQL数据库,并执行插入、更新或删除操作。 总之,Spark SQL提供了SQL语言的强大功能,并与Spark的分布式计算能力相结合,可以处理大规模的结构化和半结构化数据。通过这些例子,我们可以看到Spark SQL在数据处理和分析方面的灵活性和强大性。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值