/*
 *  Helper program for analysing web access logs
 *      URL decoding + kanji (Japanese character code) conversion
 *          written by H.Tsujimura  14 Oct 2003
 *          last update:            04 Feb 2004 by H.Tsujimura
 *
 *  See also:
 *      http://www.na.rim.or.jp/~tsupo/program/reverse/decodeLog.html
 *
 * Copyright (C) 2003, 2004 H.Tsujimura
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "iconv.h"

#define BUFFER_SIZE 2048

/*
 * Multi-byte constants are written as explicit byte escapes so that the
 * program behaves the same whatever encoding this source file is saved in.
 * All "SJIS_" constants are Shift_JIS / CP932 byte sequences, because they
 * are compared against (or written into) text already converted to CP932.
 */
static const char UTF8_FULLWIDTH_TILDE[] = "\xEF\xBD\x9E";     /* U+FF5E in UTF-8   */
static const char SJIS_WAVE_DASH[]       = "\x81\x60";         /* wave dash (0x8160) */
static const char SJIS_EQ_KEN[]          = "=\x8C\x9F";        /* '=' + first kanji of "kensaku" */
static const char SJIS_EQ_SHIN[]         = "=\x90\x56";        /* '=' + kanji "shin" (new)       */
static const char SJIS_KENSAKU_BROKEN[]  = "\x8C\x9F\x8D\xEB"; /* mis-converted "kensaku"        */
static const char SJIS_KENSAKU[]         = "\x8C\x9F\x8D\xF5"; /* "kensaku" (search)             */

/* Returns non-zero when c is an ASCII hexadecimal digit. */
static int
isHexDigit( int c )
{
    return ( ((c >= '0') && (c <= '9')) ||
             ((c >= 'A') && (c <= 'F')) ||
             ((c >= 'a') && (c <= 'f')) );
}

/* Returns non-zero when c is an ASCII decimal digit. */
static int
isDecDigit( int c )
{
    return ( (c >= '0') && (c <= '9') );
}

/* Numeric value of an ASCII hexadecimal digit (caller guarantees validity). */
static int
hexValue( int c )
{
    if ( (c >= '0') && (c <= '9') )
        return ( c - '0' );
    if ( (c >= 'a') && (c <= 'f') )
        return ( c - 'a' + 10 );
    return ( c - 'A' + 10 );
}

/*
 * Encodes one code point as UTF-8 into dst (room for 4 bytes required)
 * and returns the number of bytes written (1 to 4).
 */
static size_t
encodeUtf8( unsigned long code, unsigned char *dst )
{
    if ( code < 0x0080UL ) {
        dst[0] = (unsigned char)(code & 0xFF);
        return ( 1 );
    }
    if ( code < 0x0800UL ) {
        dst[0] = (unsigned char)(((code & 0x07C0UL) >> 6) | 0xC0);
        dst[1] = (unsigned char)(( code & 0x003FUL)       | 0x80);
        return ( 2 );
    }
    if ( code < 0x010000UL ) {
        dst[0] = (unsigned char)(((code & 0xF000UL) >> 12) | 0xE0);
        dst[1] = (unsigned char)(((code & 0x0FC0UL) >>  6) | 0x80);
        dst[2] = (unsigned char)(( code & 0x003FUL)        | 0x80);
        return ( 3 );
    }
    dst[0] = (unsigned char)(((code & 0x001C0000UL) >> 18) | 0xF0);
    dst[1] = (unsigned char)(((code & 0x0003F000UL) >> 12) | 0x80);
    dst[2] = (unsigned char)(((code & 0x00000FC0UL) >>  6) | 0x80);
    dst[3] = (unsigned char)(( code & 0x0000003FUL)        | 0x80);
    return ( 4 );
}

/* Copies src into dst (capacity cap, cap > 0), truncating if necessary. */
static void
copyBounded( char *dst, size_t cap, const char *src )
{
    size_t  len = strlen( src );

    if ( len >= cap )
        len = cap - 1;
    memcpy( dst, src, len );
    dst[len] = '\0';
}

/*
 * Unicode code point (as found in %uXXXX) -> percent-encoded UTF-8
 * ("%xx", "%xx%yy", "%xx%yy%zz" or "%xx%yy%zz%ww", lower-case hex).
 *
 * Returns a pointer to a static buffer that is overwritten by the next call.
 */
char    *
convUnicode( unsigned long code )
{
    static char     codeStr[16];
    unsigned char   bytes[4];
    size_t          count = encodeUtf8( code, bytes );
    size_t          i;
    char            *w = codeStr;

    for ( i = 0; i < count; i++ ) {
        sprintf( w, "%%%02x", (unsigned int)bytes[i] );
        w += 3;
    }

    return ( codeStr );
}

/*
 * Decodes a URL-encoded string.
 *
 * Recognised escapes:
 *   %hh        two hex digits            -> that byte
 *   %uhhhh     four hex digits           -> UTF-8 encoding of the code point
 *   &#xhh;     two hex digits            -> that byte
 *   &#dd; / &#ddd;   (no leading zero)   -> low 8 bits of the value, as a byte
 *   &#ddddd;   five digits (no leading 0)-> UTF-8 encoding of the code point
 * Anything else (including malformed escapes) is copied through unchanged.
 *
 * Input longer than the internal work buffer (BUFFER_SIZE * 2 - 1 bytes) is
 * truncated.  A NULL argument is treated as an empty string.
 *
 * Returns a pointer to a static buffer that is overwritten by the next call.
 */
char    *
decodeURL( char *str )
{
    char                    buf[BUFFER_SIZE * 2];
    static unsigned char    out[BUFFER_SIZE * 2];
    const char              *p;
    unsigned char           *q;

    /* Copy the input before clearing 'out', in case str points into 'out'. */
    copyBounded( buf, sizeof buf, str ? str : "" );
    memset( out, 0x00, sizeof out );

    /*
     * No escape produces more output bytes than it consumes input bytes,
     * so 'out' (same size as 'buf') cannot overflow.
     */
    p = buf;
    q = out;
    while ( *p ) {
        if ( p[0] == '%' ) {
            if ( isHexDigit( p[1] ) && isHexDigit( p[2] ) ) {
                *q++ = (unsigned char)((hexValue( p[1] ) << 4) | hexValue( p[2] ));
                p += 3;
                continue;
            }
            /* Each digit is checked in order, so we never read past the NUL */
            if ( (p[1] == 'u')      &&
                 isHexDigit( p[2] ) && isHexDigit( p[3] ) &&
                 isHexDigit( p[4] ) && isHexDigit( p[5] )    ) {
                /* Unicode (four hex digits) */
                unsigned long   code;

                code = ((unsigned long)hexValue( p[2] ) << 12) |
                       ((unsigned long)hexValue( p[3] ) <<  8) |
                       ((unsigned long)hexValue( p[4] ) <<  4) |
                        (unsigned long)hexValue( p[5] );
                q += encodeUtf8( code, q );
                p += 6;
                continue;
            }
        }
        else if ( (p[0] == '&') && (p[1] == '#') ) {
            if ( p[2] == 'x' ) {
                /* hexadecimal: exactly two digits followed by ';' */
                if ( isHexDigit( p[3] ) && isHexDigit( p[4] ) && (p[5] == ';') ) {
                    *q++ = (unsigned char)((hexValue( p[3] ) << 4) | hexValue( p[4] ));
                    p += 6;
                    continue;
                }
            }
            else {
                /* decimal */
                const char  *d = p + 2;

                if ( (d[0] >= '1') && (d[0] <= '9') && isDecDigit( d[1] ) ) {
                    if ( d[2] == ';' ) {
                        /* two digits -> one raw byte */
                        *q++ = (unsigned char)(strtol( d, NULL, 10 ) & 0xFF);
                        p = d + 3;
                        continue;
                    }
                    if ( isDecDigit( d[2] ) && (d[3] == ';') ) {
                        /* three digits -> one raw byte (low 8 bits) */
                        *q++ = (unsigned char)(strtol( d, NULL, 10 ) & 0xFF);
                        p = d + 4;
                        continue;
                    }
                    if ( isDecDigit( d[2] ) && isDecDigit( d[3] ) &&
                         isDecDigit( d[4] ) && (d[5] == ';')         ) {
                        /* five digits -> Unicode code point */
                        q += encodeUtf8( strtoul( d, NULL, 10 ), q );
                        p = d + 6;
                        continue;
                    }
                }
            }
        }

        *q++ = (unsigned char)(*p++);
    }
    *q = '\0';

    return ( (char *)out );
}

/*
 * Character code conversion using the iconv library.
 *
 * p        NUL-terminated input text (at most BUFFER_SIZE bytes)
 * inCode   iconv name of the input encoding
 * outCode  iconv name of the output encoding
 *
 * Returns a pointer to a static buffer holding the converted text, or NULL
 * when an argument is NULL, the input is too long, the converter cannot be
 * opened, or iconv() reports an error.  The buffer is overwritten by the
 * next call.
 */
char    *
convertCode( const char *p, const char *inCode, const char *outCode )
{
    char            inbuf[BUFFER_SIZE + 1];
    static char     outbuf[BUFFER_SIZE * 2 + 1];
    size_t          inbufSiz;
    size_t          outbufSiz = BUFFER_SIZE * 2;
    size_t          result;
    iconv_t         cd;
    char            *inp;
    char            *outp;

    if ( !p || !inCode || !outCode )
        return ( NULL );

    inbufSiz = strlen( p );
    if ( inbufSiz > BUFFER_SIZE )
        return ( NULL );
    /* Take the private copy first: p may point into outbuf. */
    memcpy( inbuf, p, inbufSiz + 1 );

    /* One spare byte is never handed to iconv(), so output stays terminated */
    memset( outbuf, 0x00, sizeof outbuf );

    cd = iconv_open( outCode, inCode );
    if ( cd == (iconv_t)-1 )    /* iconv_open() signals failure with -1 */
        return ( NULL );

    inp    = inbuf;
    outp   = outbuf;
    result = iconv( cd, &inp, &inbufSiz, &outp, &outbufSiz );
    iconv_close( cd );

    if ( result == (size_t)-1 )
        return ( NULL );

    return ( outbuf );
}

/* Shift_JIS -> CP932.  Result and failure semantics as for convertCode(). */
char    *
sjis2sjis( const char *p )
{
    return ( convertCode( p, "SHIFT_JIS", "CP932" ) );
}

/*
 * UTF-8 -> CP932.
 *
 * Workaround (2003.10.15): iconv() was found to fail on strings containing
 * U+FF5E (UTF-8: 0xEF 0xBD 0x9E).  Such sequences are replaced by "___"
 * before the conversion, one occurrence per attempt, and every "___" in a
 * successful result is turned back into the CP932 wave dash (0x81 0x60).
 *
 * NOTE: despite the const qualifier, the replacement is done IN PLACE in
 * the caller's buffer (as it always has been), so p must point to writable
 * memory and keeps the "___" markers afterwards.
 *
 * Returns the static buffer of convertCode(), or NULL on failure.
 */
char    *
utf2sjis( const char *p )
{
    char    *text = (char *)p;  /* deliberately writable, see NOTE above */
    char    *hit;
    char    *converted;
    char    *mark;

    if ( !text )
        return ( NULL );

    for ( ;; ) {
        hit = strstr( text, UTF8_FULLWIDTH_TILDE );
        if ( hit )
            memcpy( hit, "___", 3 );

        converted = convertCode( text, "UTF-8", "CP932" );
        if ( converted ) {
            if ( hit ) {
                while ( ( mark = strstr( converted, "___" ) ) != NULL ) {
                    /* 3-byte marker -> 2-byte character, then close the gap */
                    memcpy( mark, SJIS_WAVE_DASH, 2 );
                    memmove( mark + 2, mark + 3, strlen( mark + 3 ) + 1 );
                }
            }
            return ( converted );
        }

        /*
         * Some patterns fail on the first attempt but succeed once a
         * further occurrence has been replaced, so retry while there is
         * still something left to replace.
         */
        if ( !hit )
            return ( NULL );
    }
}

/*
 * EUC-JP -> CP932.
 *
 * Platform-dependent character support (for now only Roman numeral three):
 * the first EUC-JP sequence 0xAD 0xB7 is replaced IN PLACE in the caller's
 * buffer by "~_" before the conversion (so p must be writable despite the
 * const qualifier), and the first "~_" of the converted text is replaced
 * by the CP932 code 0x87 0x56.
 *
 * Returns the static buffer of convertCode(), or NULL on failure.
 */
char    *
euc2sjis( const char *p )
{
    static const char   eucRomanThree[] = "\xAD\xB7";
    char                *text = (char *)p;
    char                *hit;
    char                *converted;
    char                *mark;

    if ( !text )
        return ( NULL );

#if 0   /* enable the following code if needed */
    /* [provisional] a string containing U+FF5E (UTF-8: 0xEF 0xBD 0x9E) */
    /* is decoded as UTF-8 instead of EUC-JP                            */
    if ( strstr( text, UTF8_FULLWIDTH_TILDE ) != NULL )
        return ( utf2sjis( p ) );
#endif

    hit = strstr( text, eucRomanThree );
    if ( hit ) {
        hit[0] = '~';
        hit[1] = '_';
    }

    converted = convertCode( text, "EUC-JP", "CP932" );
    if ( hit && converted ) {
        /* Search for the whole marker: URLs often contain a lone '~'. */
        mark = strstr( converted, "~_" );
        if ( mark ) {
            mark[0] = (char)0x87;
            mark[1] = (char)0x56;
        }
    }

    return ( converted );
}

/* ISO-2022-JP -> CP932.  Result and failure semantics as for convertCode(). */
char    *
jis2sjis( const char *p )
{
    return ( convertCode( p, "ISO-2022-JP", "CP932" ) );
}

/*
 * Convert src to CP932, trying UTF-8 first, then EUC-JP; ISO-2022-JP is
 * tried when both fail or when the result still contains the "(B" escape
 * remnant.  If everything fails dst is set to src itself.
 * Both arguments are evaluated several times.
 */
#define UTF2SJIS( src, dst )    do {                    \
        (dst) = utf2sjis( src );                        \
        if ( !(dst) )                                   \
            (dst) = euc2sjis( src );                    \
        if ( !(dst) || strstr( (dst), "(B" ) )          \
            (dst) = jis2sjis( src );                    \
        if ( !(dst) )                                   \
            (dst) = (src);                              \
    } while ( 0 )

/* Same as UTF2SJIS, but EUC-JP is tried before UTF-8. */
#define EUC2SJIS( src, dst )    do {                    \
        (dst) = euc2sjis( src );                        \
        if ( !(dst) )                                   \
            (dst) = utf2sjis( src );                    \
        if ( !(dst) || strstr( (dst), "(B" ) )          \
            (dst) = jis2sjis( src );                    \
        if ( !(dst) )                                   \
            (dst) = (src);                              \
    } while ( 0 )

/*
 *  Conversion of one log fragment
 *
 *  Decodes the URL escapes in buf, converts the text to CP932 (with a
 *  number of site-specific workarounds) and writes the result to stdout.
 *  buf is used as scratch space and is modified.
 */
void
_decodeLog( char *buf )
{
    char    *p = buf, *q, *r, *s, *t;
    int     utfEuc = 0;

    /* Decide between UTF-8 and EUC-JP up front when the line tells us */
    /* (reduces the chance of a wrong conversion)                      */
    if ( strstr( p, "=UTF-8" ) || strstr( p, "=UTF8" ) ||
         strstr( p, "=utf-8" ) || strstr( p, "=utf8" )    )
        utfEuc = 1; /* UTF-8  */
    else if ( strstr( p, "=EUC-JP" ) || strstr( p, "=euc-jp" ) )
        utfEuc = 2; /* EUC-JP */

    if ( (strchr( p, '%' ) != NULL) || (strstr( p, "&#" ) != NULL) )
        q = decodeURL( p );
    else
        q = p;

    if ( utfEuc == 1 ) {
        UTF2SJIS( q, r );
    }
    else if ( utfEuc == 2 ) {
        EUC2SJIS( q, r );
    }
    else if ( ( r = strchr( q, '?' ) ) != NULL ) {
        char    *pp;

        if ( ((s = strstr( q, "client=nifty&ie=euc-jp&oe=utf-8" )) != NULL) &&
             ((s = strstr( s, "&funcno=1&" )) != NULL)                         ) {
            char    tmp[BUFFER_SIZE];

            /* Convert the part before "&funcno=1&" as EUC-JP and append */
            /* the remainder unconverted.                                */
            copyBounded( tmp, sizeof tmp, s );
            *s = '\0';
            /* NOTE(review): only the text from '?' onwards is kept here; */
            /* the part of the line before '?' is dropped - confirm this  */
            /* is intended.  memmove: r may point into buf itself.        */
            memmove( buf, r, strlen( r ) + 1 );
            q = buf;
            EUC2SJIS( q, r );
            strcat( r, tmp );
        }
        else if ( (strstr( q, "search.yahoo.co.jp" ) ||
                   strstr( q, "search.nifty.com" )   ||
                   strstr( q, "search.goo.ne.jp" )     )   &&
                  ((pp = strchr( q, 0xAD )) != NULL)       &&
                  ((unsigned char)pp[1] == 0xEA)              ) {
            /* Counter-measure against a wrong conversion when the bytes */
            /* 0xAD 0xEA occur (originally specific to Yahoo! Japan).    */
            /* The byte is compared as unsigned char; a plain char       */
            /* comparison never matches where char is signed.            */
            *pp++ = '?';
            *pp++ = 0x20;
            EUC2SJIS( q, r );
        }
        else {
            UTF2SJIS( q, r );
        }
    }
    else
        r = q;

    if ( (strstr( r, "jpsearch.naver.com" ) != NULL) &&
         (strstr( r, "&query" ) != NULL)                ) {
        /* jpsearch.naver.com is a Korean search engine (uses EUC-KR) */
        unsigned char   *pp = (unsigned char *)q;

        /* EUC-KR -> EUC-JP conversion (only for the characters we can) */
        for ( ; pp && *pp; pp++ ) {
            if ( *pp == 0xAA ) {    /* hiragana row */
                *pp = 0xA4;
                continue;
            }
            if ( *pp == 0xAB ) {    /* katakana row */
                *pp = 0xA5;
                continue;
            }
            if ( (*pp == 0xEE) && (*(pp + 1) == 0xE5) ) {   /* kanji "red" */
                *pp++ = 0xC0;
                *pp   = 0xD6;
                continue;
            }
            if ( (*pp == 0xDC) && (*(pp + 1) == 0xD2) ) {   /* kanji "boku" */
                *pp++ = 0xCB;
                *pp   = 0xCD;
                continue;
            }
        }
        EUC2SJIS( q, r );
    }

    s = NULL;
    if ( strstr( r, "infoseek" ) != NULL ) {
        s = strstr( r, SJIS_EQ_KEN );
        if ( !s )
            s = strstr( r, SJIS_EQ_SHIN );
    }
    if ( s != NULL ) {
        /* Several character encodings are mixed on one line */
        /* (counter-measure specific to infoseek)            */
        char    tmp[BUFFER_SIZE];

        copyBounded( tmp, sizeof tmp, s );
        *s = '\0';
        memmove( buf, r, strlen( r ) + 1 );     /* r may be buf itself */
        q = buf;
        EUC2SJIS( q, r );
        strcat( r, tmp );

        if ( ( s = strstr( r, SJIS_KENSAKU_BROKEN ) ) != NULL ) {
            /* Repair the word and, since 2003.10.16, also rescue as much */
            /* as possible of the text that follows it.                   */
            memcpy( s, SJIS_KENSAKU, 4 );
            t = strstr( r, "input+type" );
            if ( (t != NULL) && (t > s) ) {
                /* Move the tail first (regions may overlap), then put */
                /* the separator in front of it.                       */
                memmove( s + 5, t, strlen( t ) + 1 );
                *(s + 4) = '&';
            }
        }
    }

    fputs( r, stdout );
}

/*
 *  Main conversion routine
 */
/*
 * Reads fp line by line, decodes every line and writes it to stdout.
 *
 * A line containing " -> " is treated as an access-analysis log written by
 * reverse.cgi
 * (see http://www.na.rim.or.jp/~tsupo/program/reverse/reverse2.html):
 * it is split at " -> " and, if present, at " => ", and the parts are
 * decoded separately.  Any other line is treated as the conventional
 * access-analysis log format
 * (see http://www.na.rim.or.jp/~tsupo/program/reverse/) and decoded whole.
 */
void
decodeLog( FILE *fp )
{
    char    buf[BUFFER_SIZE];
    char    *p, *q, *r, *s;

    while ( ( p = fgets( buf, BUFFER_SIZE - 1, fp ) ) != NULL ) {
        q = strstr( p, " -> " );
        if ( q == NULL ) {
            /* conventional access-analysis log */
            _decodeLog( p );
            continue;
        }

        /* Cut at the leading blank; q then starts at "-> ..." */
        *q++ = '\0';
        _decodeLog( p );
        putchar( ' ' );

        r = strstr( q, " => " );
        if ( r == NULL ) {
            _decodeLog( q );
            continue;
        }

        /* Cut at the leading blank; r then starts at "=> ..." */
        *r++ = '\0';
        _decodeLog( q );
        putchar( ' ' );

        /* Last part: try UTF-8 on the URL-decoded text, then EUC-JP on */
        /* the text as read, and finally fall back to the text itself.  */
        s = utf2sjis( decodeURL( r ) );
        if ( s && *s ) {
            fputs( s, stdout );
            continue;
        }
        s = euc2sjis( r );
        if ( s && *s )
            fputs( s, stdout );
        else
            fputs( r, stdout );
    }
}

/*
 * Decodes every file named on the command line, or stdin when no file is
 * given.  Returns 0 on success and 1 when at least one file could not be
 * opened (the failure is reported on stderr and the remaining files are
 * still processed).
 */
int
main( int argc, char *argv[] )
{
    int     status = 0;
    int     i;

    if ( argc <= 1 ) {
        decodeLog( stdin );
        return ( 0 );
    }

    for ( i = 1; i < argc; i++ ) {
        FILE    *fp = fopen( argv[i], "r" );

        if ( fp == NULL ) {
            perror( argv[i] );
            status = 1;
            continue;
        }
        decodeLog( fp );
        fclose( fp );
    }

    return ( status );
}