LIBNAMEのJSONエンジンの欠点は、32KBを超える長さのデータを抽出できない点です。できないものは仕方がないので、32KBを超えるSASプログラムでも抽出できるコードを書きました。
字句解析の状態遷移を考えて、"code": の値を抽出し、バックスラッシュのエスケープを解いてファイルに保存します。入力のJSONファイルは、整形されていないコンパクトな書式であることが前提です。昔、Lex, Yaccを使っていた記憶を掘り起こしました。
- /* Code extraction from JSON file of job definition */
- /* Scans the input one byte at a time with a small state machine, looks for a quoted token equal to code followed by a colon and an opening double quote, and writes the unescaped string value to the output file. */
- /* infile=  : first argument passed to fopen for reading */
- /* outfile= : first argument passed to fopen for writing */
- /* debug=   : when 1, the first 512 loop iterations print stat, rec and token to the log */
- %macro extractJobCode(infile=, outfile=, debug=0);
- /* Single-byte constants (hex literals) used by the scanner */
- %local/readonly DQ='22'x;
- %local/readonly BSL='5c'x;
- %local/readonly BS='08'x;
- %local/readonly FF='0c'x;
- %local/readonly NL='0a'x;
- %local/readonly CR='0d'x;
- %local/readonly TB='09'x;
- %local/readonly SP='20'x;
-
- data _NULL_;
- attrib filein length=8 label='input file id';
- attrib fileid length=8 label='output file id';
- attrib stat length=8 label='State of lexical analysis';
- attrib token length=$256 label='Token';
- attrib ucode length=$4 label='Unicode';
- attrib count length=8;
-
- /* Open input/output files */
- /* Both are opened with record length 1 and record format B, so each fread or fwrite moves one byte */
- /* NOTE(review): the fopen return values are not checked before use */
- filein=fopen("&infile" , 'I', 1, 'B');
- fileid=fopen("&outfile", 'O', 1, 'B');
- count=0;
- stat=0;
- /* Assigning a one-byte literal makes rec a one-character variable */
- rec=&SP;
-
- /* Main loop: one iteration per input byte, ends when fread returns nonzero */
- /* stat 0: outside any string                                   */
- /* stat 1: inside a quoted string, bytes are collected in token */
- /* stat 2: a string equal to code just ended, expecting a colon */
- /* stat 3: colon seen, expecting the opening quote of the value */
- /* stat 4: inside the code value, bytes are copied to output    */
- /* stat 5: a backslash was seen inside the code value           */
- do while(fread(filein)=0);
- rc=fget(filein, rec, 1);
-
- /* Optional trace of the first 512 bytes, generated only when debug=1 */
- %if &debug=1 %then
- %do;
-
- if count < 512 then
- put stat=rec=token=;
- count=count + 1;
- %end;
-
- if stat eq 0 and rec eq &DQ then
- stat=1;
- else if stat eq 1 then
- do;
-
- if rec eq &DQ then
- do;
-
- /* Double quotes */
- /* End of string: continue to stat 2 only when the collected token is code */
- if token eq 'code' then
- stat=2;
- else
- stat=0;
- token='';
- end;
- else
- do;
- /* NOTE(review): backslash escapes are not interpreted in this state, and catt drops a blank rec - confirm this is acceptable for the expected input */
- token=catt(token, rec);
- stat=1;
- end;
- end;
- else if stat eq 2 then
- do;
-
- /* Skip space, tab and newline; anything other than a colon returns to stat 0 */
- if rec in (&SP, &TB, &NL) then
- stat=2;
- else if rec eq ':' then
- stat=3;
- else
- stat=0;
- end;
- else if stat eq 3 then
- do;
-
- /* Skip space, tab and newline; anything other than a double quote returns to stat 0 */
- if rec in (&SP, &TB, &NL) then
- stat=3;
- else if rec eq &DQ then
- stat=4;
- else
- stat=0;
- end;
- else if stat eq 4 then
- do;
-
- /* A double quote ends the value, a backslash starts an escape, any other byte is written as is */
- if rec eq &DQ then
- stat=0;
- else if rec eq &BSL then
- stat=5;
- else
- do;
- rc=fput(fileid, rec);
- rc=fwrite(fileid);
- end;
- end;
- else if stat eq 5 then
- do;
-
- /* Handling of backslash-escaped characters */
- /* NOTE(review): a character not listed below (for example a slash) writes nothing and leaves stat at 5 */
- if rec eq 'b' then
- do;
- rc=fput(fileid, &BS);
- rc=fwrite(fileid);
- stat=4;
- end;
- else if rec eq &DQ then
- do;
- rc=fput(fileid, &DQ);
- rc=fwrite(fileid);
- stat=4;
- end;
- else if rec eq &BSL then
- do;
- rc=fput(fileid, &BSL);
- rc=fwrite(fileid);
- stat=4;
- end;
- else if rec eq 'f' then
- do;
- rc=fput(fileid, &FF);
- rc=fwrite(fileid);
- stat=4;
- end;
- else if rec eq 'n' then
- do;
- rc=fput(fileid, &NL);
- rc=fwrite(fileid);
- stat=4;
- end;
- else if rec eq 'r' then
- do;
- rc=fput(fileid, &CR);
- rc=fwrite(fileid);
- stat=4;
- end;
- else if rec eq 't' then
- do;
- rc=fput(fileid, &TB);
- rc=fwrite(fileid);
- stat=4;
- end;
- else if rec eq 'u' then
- do;
- /* Read the four characters that follow the u into ucode */
- /* NOTE(review): the fread return code is ignored inside this loop */
- ucode='';
- do i=1 to 4;
- rc=fread(filein);
- rc=fget(filein, rec, 1);
- ucode=catt(ucode, rec);
- end;
- /* Only the lowercase spellings 0026, 003c and 003e are decoded */
- if ucode eq '0026' then
- do;
- rc=fput(fileid, '&');
- rc=fwrite(fileid);
- end;
- else if ucode eq '003c' then
- do;
- rc=fput(fileid, '<');
- rc=fwrite(fileid);
- end;
- else if ucode eq '003e' then
- do;
- rc=fput(fileid, '>');
- rc=fwrite(fileid);
- end;
- else
- do;
- /* Any other code is passed through as backslash, u and the four characters */
- /* NOTE(review): three fput calls precede a single fwrite although the file was opened with record length 1 - confirm that all six bytes reach the output */
- rc=fput(fileid, &BSL);
- rc=fput(fileid, 'u');
- rc=fput(fileid, ucode);
- rc=fwrite(fileid);
- end;
- stat=4;
- end;
- end;
- end;
-
- /* Close input/output files */
- rc=fclose(filein);
- rc=fclose(fileid);
- run;
-
- %mend;
/* Code extraction from JSON file of job definition
 *
 * Reads the input one byte at a time, finds every string that follows
 * a key named code, decodes its JSON backslash escapes and writes the
 * result to the output one byte at a time. Working byte by byte avoids
 * any limit on the length of the extracted value.
 *
 * Parameters
 *   infile=   first argument given to fopen for the JSON input
 *   outfile=  first argument given to fopen for the extracted code
 *   debug=    1 prints stat, rec and token for the first 512 bytes
 *
 * Scanner states (variable stat)
 *   0  outside any string
 *   1  inside an ordinary string; its raw bytes are kept in token
 *   6  inside an ordinary string, directly after a backslash
 *   2  a string equal to code just ended; waiting for the colon
 *   3  colon seen; waiting for the opening quote of the value
 *   4  inside the code value; bytes are copied to the output
 *   5  inside the code value, directly after a backslash
 */
%macro extractJobCode(infile=, outfile=, debug=0);
    /* Single-byte constants used by the scanner */
    %local/readonly DQ='22'x;
    %local/readonly BSL='5c'x;
    %local/readonly BS='08'x;
    %local/readonly FF='0c'x;
    %local/readonly NL='0a'x;
    %local/readonly CR='0d'x;
    %local/readonly TB='09'x;
    %local/readonly SP='20'x;

    data _NULL_;
        attrib filein length=8    label='input file id';
        attrib fileid length=8    label='output file id';
        attrib stat   length=8    label='State of lexical analysis';
        attrib token  length=$256 label='Token';
        attrib toklen length=8    label='Number of bytes seen in the current string';
        attrib ucode  length=$4   label='Unicode';
        attrib cp     length=8    label='Code point decoded from ucode';
        attrib rec    length=$1   label='Current input byte';
        attrib obuf   length=$6   label='Bytes to write for the current input byte';
        attrib olen   length=8    label='Number of valid bytes in obuf';
        attrib och    length=$1   label='Single output byte';
        attrib msg    length=$256 label='Message from sysmsg';
        attrib count  length=8;

        /* Open input/output files.
           Record length 1 with record format B: every fread and every
           fwrite transfers exactly one byte. */
        filein=fopen("&infile" , 'I', 1, 'B');
        if filein le 0 then
        do;
            msg=sysmsg();
            put "ERROR: extractJobCode could not open input &infile: " msg;
        end;

        fileid=fopen("&outfile", 'O', 1, 'B');
        if fileid le 0 then
        do;
            msg=sysmsg();
            put "ERROR: extractJobCode could not open output &outfile: " msg;
        end;

        count=0;
        stat=0;
        toklen=0;
        rec=&SP;

        /* Scan only when both files are usable */
        if filein gt 0 and fileid gt 0 then
        do;
            do while(fread(filein)=0);
                rc=fget(filein, rec, 1);
                olen=0;

                %if &debug=1 %then
                %do;
                    if count < 512 then
                        put stat=rec=token=;
                    count=count + 1;
                %end;

                if stat eq 0 then
                do;
                    if rec eq &DQ then
                    do;
                        stat=1;
                        token='';
                        toklen=0;
                    end;
                end;
                else if stat in (1, 6) then
                do;
                    if stat eq 1 and rec eq &DQ then
                    do;
                        /* Unescaped double quote: the string ends here.
                           toklen is checked as well because a character
                           comparison ignores trailing blanks. */
                        if toklen eq 4 and token eq 'code' then
                            stat=2;
                        else
                            stat=0;
                        token='';
                        toklen=0;
                    end;
                    else
                    do;
                        /* Keep the raw byte. Bytes beyond the capacity of
                           token are only counted; such a string can never
                           be the key we are looking for. */
                        toklen=toklen + 1;
                        if toklen le 256 then
                            substr(token, toklen, 1)=rec;

                        /* A backslash protects the following byte, so an
                           escaped double quote does not end the string. */
                        if stat eq 6 then
                            stat=1;
                        else if rec eq &BSL then
                            stat=6;
                    end;
                end;
                else if stat eq 2 then
                do;
                    /* Whitespace is allowed between the key and the colon */
                    if rec in (&SP, &TB, &NL, &CR) then
                        stat=2;
                    else if rec eq ':' then
                        stat=3;
                    else
                        stat=0;
                end;
                else if stat eq 3 then
                do;
                    /* Whitespace is allowed between the colon and the value;
                       a value that is not a string is ignored. */
                    if rec in (&SP, &TB, &NL, &CR) then
                        stat=3;
                    else if rec eq &DQ then
                        stat=4;
                    else
                        stat=0;
                end;
                else if stat eq 4 then
                do;
                    if rec eq &DQ then
                        stat=0;
                    else if rec eq &BSL then
                        stat=5;
                    else
                    do;
                        obuf=rec;
                        olen=1;
                    end;
                end;
                else if stat eq 5 then
                do;
                    /* Handling of backslash-escaped characters.
                       Every escape returns to state 4, including ones that
                       are not recognised, so the scanner cannot get stuck. */
                    stat=4;
                    olen=1;
                    select (rec);
                        when ('b') obuf=&BS;
                        when ('f') obuf=&FF;
                        when ('n') obuf=&NL;
                        when ('r') obuf=&CR;
                        when ('t') obuf=&TB;
                        when ('u')
                        do;
                            /* Collect the four hex digits; stop filling
                               ucode if the input ends early. */
                            ucode='';
                            do i=1 to 4;
                                if fread(filein) eq 0 then
                                do;
                                    rc=fget(filein, rec, 1);
                                    substr(ucode, i, 1)=rec;
                                end;
                            end;

                            cp=.;
                            if notxdigit(ucode) eq 0 then
                                cp=input(upcase(ucode), hex4.);

                            if 0 <= cp <= 127 then
                                /* Single-byte code point: emit the byte */
                                obuf=byte(cp);
                            else
                            do;
                                /* Anything else is passed through as the
                                   original six-byte escape sequence. */
                                obuf=&BSL || 'u' || ucode;
                                olen=6;
                            end;
                        end;
                        /* Double quote, backslash, slash and any unknown
                           escape stand for the character itself. */
                        otherwise obuf=rec;
                    end;
                end;

                /* The output record holds one byte, so each byte gets its
                   own fput and fwrite. */
                do i=1 to olen;
                    och=substr(obuf, i, 1);
                    rc=fput(fileid, och);
                    rc=fwrite(fileid);
                end;
            end;
        end;

        /* Close input/output files */
        if filein gt 0 then
            rc=fclose(filein);
        if fileid gt 0 then
            rc=fclose(fileid);
    run;
%mend;