/*
 *  web access log 解析用補助プログラム
 *     URLデコード処理 + 漢字コード変換処理
 *     written by H.Tsujimura  14 Oct 2003
 *     last update: 04 Feb 2004 by H.Tsujimura
 *
 *    See also:
 *     http://www.na.rim.or.jp/~tsupo/program/reverse/decodeLog.html
 *
 *  Copyright (C) 2003, 2004 H.Tsujimura
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "iconv.h"


/*
 * Base size, in bytes, from which the fixed-size work buffers in this
 * file are dimensioned (BUFFER_SIZE, BUFFER_SIZE + 1, BUFFER_SIZE * 2, ...).
 * decodeLog() reads each input line into a buffer of this size.
 */
#define BUFFER_SIZE 2048

/*
 * Convert a Unicode code point (as carried by a %uXXXX escape) into the
 * URL-encoded spelling of its UTF-8 byte sequence.
 *
 *   code    : code point to encode
 *   returns : pointer to a static NUL-terminated buffer that holds one
 *             lower-case "%xx" group per UTF-8 byte (1 to 4 groups).
 *             The buffer is overwritten by the next call.
 */
char    *
convUnicode( unsigned long code )
{
    static char     codeStr[16];
    unsigned int    octet[4];
    int             count;
    int             i;
    char            *w;

    if ( code < 0x0080 ) {
        /* 1 byte: 0xxxxxxx */
        octet[0] = (unsigned int)(code & 0xFF);
        count    = 1;
    }
    else if ( code < 0x0800 ) {
        /* 2 bytes: 110xxxxx 10xxxxxx */
        octet[0] = (unsigned int)(0xC0 | ((code >> 6) & 0x1F));
        octet[1] = (unsigned int)(0x80 | ( code       & 0x3F));
        count    = 2;
    }
    else if ( code < 0x010000 ) {
        /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
        octet[0] = (unsigned int)(0xE0 | ((code >> 12) & 0x0F));
        octet[1] = (unsigned int)(0x80 | ((code >>  6) & 0x3F));
        octet[2] = (unsigned int)(0x80 | ( code        & 0x3F));
        count    = 3;
    }
    else {
        /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        octet[0] = (unsigned int)(0xF0 | ((code >> 18) & 0x07));
        octet[1] = (unsigned int)(0x80 | ((code >> 12) & 0x3F));
        octet[2] = (unsigned int)(0x80 | ((code >>  6) & 0x3F));
        octet[3] = (unsigned int)(0x80 | ( code        & 0x3F));
        count    = 4;
    }

    /* At most 4 groups of 3 characters plus the NUL fit into codeStr. */
    w = codeStr;
    for ( i = 0; i < count; i++ )
        w += sprintf( w, "%%%02x", octet[i] & 0xFF );

    return ( codeStr );
}


/* Return non-zero when c is an ASCII hexadecimal digit. */
static int
isHexChar( char c )
{
    return ( ((c >= '0') && (c <= '9')) ||
             ((c >= 'A') && (c <= 'F')) ||
             ((c >= 'a') && (c <= 'f'))    );
}

/* Convert the two hexadecimal digits s[0], s[1] into one byte. */
static unsigned char
hexPairToByte( const char *s )
{
    char    tmp[3];

    tmp[0] = s[0];
    tmp[1] = s[1];
    tmp[2] = '\0';
    return ( (unsigned char)(strtol( tmp, NULL, 16 ) & 0xFF) );
}

/*
 * Append the UTF-8 bytes of `code` at q and return the advanced pointer.
 * convUnicode() yields one "%xx" group per byte; every group it returns
 * (1 to 4) is consumed, and nothing beyond its terminating NUL is read.
 */
static unsigned char *
appendUtf8( unsigned char *q, unsigned long code )
{
    const char  *r = convUnicode( code );

    while ( (r[0] == '%') && isHexChar( r[1] ) && isHexChar( r[2] ) ) {
        *q++ = hexPairToByte( r + 1 );
        r += 3;
    }

    return ( q );
}

/*
 *  Decode a URL-encoded string.
 *
 *  Recognised escapes:
 *    %hh        two hex digits            -> that byte
 *    %uXXXX     four hex digits           -> UTF-8 bytes of the code point
 *    &#xhh;     two hex digits            -> that byte
 *    &#dd;      two or three decimal      -> low 8 bits of the value
 *    &#ddd;     digits (no leading zero)
 *    &#ddddd;   five decimal digits       -> UTF-8 bytes of the code point
 *  Anything else, including malformed escapes, is copied unchanged.
 *
 *  str     : NUL-terminated input; it is not modified.  Input longer
 *            than the internal buffer (BUFFER_SIZE * 2 - 1 bytes) is
 *            truncated.
 *  returns : pointer to a static buffer holding the decoded bytes,
 *            overwritten by the next call.  A decoded 0x00 byte ends
 *            the string early from the caller's point of view.
 */
char    *
decodeURL( char *str )
{
    char                  buf[BUFFER_SIZE * 2];
    static unsigned char  out[BUFFER_SIZE * 2];
    char                  *p;
    unsigned char         *q;
    size_t                len;

    /* Bounded copy: never write past buf, whatever the caller passes. */
    len = strlen( str );
    if ( len >= sizeof ( buf ) )
        len = sizeof ( buf ) - 1;
    memcpy( buf, str, len );
    buf[len] = '\0';

    /* Every escape is at least as long as what it decodes to, so the */
    /* output can never be longer than the (bounded) input.           */
    memset( out, 0x00, sizeof ( out ) );
    p = buf;
    q = out;
    while ( *p ) {
        if ( *p == '%' ) {
            /* The && chains stop at the first mismatch, so no byte */
            /* behind the terminating NUL is ever inspected.        */
            if ( isHexChar( p[1] ) && isHexChar( p[2] ) ) {
                *q++ = hexPairToByte( p + 1 );
                p += 3;
                continue;
            }
            if ( (p[1] == 'u')     &&
                 isHexChar( p[2] ) && isHexChar( p[3] ) &&
                 isHexChar( p[4] ) && isHexChar( p[5] )    ) {
                /* Unicode escape (4 hex digits) */
                unsigned long   code;

                code  = (unsigned long)hexPairToByte( p + 2 ) << 8;
                code |= (unsigned long)hexPairToByte( p + 4 );
                q = appendUtf8( q, code );
                p += 6;
                continue;
            }
        }
        else if ( (p[0] == '&') && (p[1] == '#') ) {
            if ( p[2] == 'x' ) {
                /* hexadecimal character reference */
                if ( isHexChar( p[3] ) && isHexChar( p[4] ) &&
                     (p[5] == ';')                             ) {
                    *q++ = hexPairToByte( p + 3 );
                    p += 6;
                    continue;
                }
            }
            else if ( (p[2] >= '1') && (p[2] <= '9') ) {
                /* decimal character reference */
                const char  *d      = p + 2;
                size_t      digits  = 1;

                while ( (d[digits] >= '0') && (d[digits] <= '9') )
                    digits++;

                if ( d[digits] == ';' ) {
                    if ( (digits == 2) || (digits == 3) ) {
                        /* Kept as in the original: only the low byte */
                        /* of the value is emitted.                   */
                        long    l = strtol( d, NULL, 10 );

                        *q++ = (unsigned char)(l & 0xFF);
                        p += 2 + digits + 1;
                        continue;
                    }
                    if ( digits == 5 ) {
                        q = appendUtf8( q, strtoul( d, NULL, 10 ) );
                        p += 2 + digits + 1;
                        continue;
                    }
                }
            }
        }

        *q++ = (unsigned char)(*p++);
    }
    *q = '\0';

    return ( (char *)out );
}


/*
 *  Character-set conversion using the iconv library.
 *
 *  p       : NUL-terminated input text
 *  inCode  : iconv name of the encoding of p
 *  outCode : iconv name of the wanted encoding
 *  returns : pointer to a static buffer holding the converted,
 *            NUL-terminated text (overwritten by the next call), or
 *            NULL when the input does not fit the work buffer
 *            (BUFFER_SIZE bytes), the converter cannot be opened, or
 *            iconv() reports an error.
 */
char    *
convertCode( const char *p, const char *inCode, const char *outCode )
{
    char        inbuf[BUFFER_SIZE + 1];
    static char outbuf[BUFFER_SIZE * 2 + 1];
    size_t      result;
    size_t      inbufSiz;
    size_t      outbufSiz;
    iconv_t     cd;
    char        *inp;
    char        *outp;

    /* Copy the input first (bounded), so that it survives the clearing */
    /* of outbuf even if the caller handed a previous result back in.   */
    inbufSiz = strlen( p );
    if ( inbufSiz >= sizeof ( inbuf ) )
        return ( NULL );
    memcpy( inbuf, p, inbufSiz + 1 );

    memset( outbuf, 0x00, sizeof ( outbuf ) );

    /* iconv_open() signals failure with (iconv_t)-1, not NULL. */
    cd = iconv_open( outCode, inCode );
    if ( cd == (iconv_t)-1 )
        return ( NULL );

    inp       = inbuf;
    outp      = outbuf;
    /* Leave the last byte of outbuf untouched so the result always */
    /* stays NUL-terminated.                                        */
    outbufSiz = sizeof ( outbuf ) - 1;
    result = iconv( cd,
                    &inp,  &inbufSiz,
                    &outp, &outbufSiz );

    iconv_close( cd );

    /* iconv() returns (size_t)-1 on error. */
    if ( result == (size_t)-1 )
        return ( NULL );

    return ( outbuf );
}

/*
 * Run a string through the "SHIFT_JIS" -> "CP932" conversion.
 *
 *   p       : NUL-terminated input text
 *   returns : whatever convertCode() returns for that conversion
 */
char    *
sjis2sjis( const char *p )
{
    char    *converted;

    converted = convertCode( p, "SHIFT_JIS", "CP932" );

    return ( converted );
}

char    *
utf2sjis( const char *p )
{
    char            *q, *r;
    unsigned char   aaa[4];
    int             flag = 0;

    /* "～" (UTF-8 で 0xEF 0xBD 0x9E) を含む文字列の場合、 */
    /* iconv() が UTF-8 → CP932 変換に失敗することが判明。*/
    /* 対策を組み込む                           2003.10.15 */
    aaa[0] = 0xEF; aaa[1] = 0xBD; aaa[2] = 0x9E; aaa[3] = 0x00;
    if ( ( q = strstr( p, aaa ) ) != NULL ) {
        *q++ = '_';
        *q++ = '_';
        *q++ = '_';
        flag = 1;
    }

    q = convertCode( p, "UTF-8", "CP932" );
    if ( q && flag ) {
        r = strstr( q, "___" );
        while ( r ) {
            *r = '\0';
            strcat( q, "～" );
            strcat( q, r + 3 );
	    r = strstr( q, "___" );
        }
    }
    else if ( flag ) {
        /* "。～" 等のパターンの場合、いったん変換に失敗するが、 */
        /* "～" → "___" に置換後、再度変換を試みると成功する    */
        return ( utf2sjis( p ) ); /* 再度変換 */
    }

    return ( q );
}

char    *
euc2sjis( const char *p )
{
#if 0 /* 必要なら、以下のコードを有効にする */
    char            *q;
    unsigned char   aaa[4];

    /* "～" (UTF-8 で 0xEF 0xBD 0x9E) を含む文字列の場合、 */
    /* EUC-JP ではなく、UTF-8 とみなしてデコード [暫定]    */
    aaa[0] = 0xEF; aaa[1] = 0xBD; aaa[2] = 0x9E; aaa[3] = 0x00;
    if ( ( q = strstr( p, aaa ) ) != NULL )
        return ( utf2sjis( p ) );
#endif
    /* 機種依存文字対応(とりあえず、ローマ数字の3(III)に対応 */
    char            *q, *r;
    unsigned char   aaa[3];
    aaa[0] = 0xAD; aaa[1] = 0xB7; aaa[2] = '\0';
    if ( ( q = strstr( p, aaa ) ) != NULL ) {
        *q++ = '~';
        *q++ = '_';
        q = convertCode( p, "EUC-JP", "CP932" );
        if ( q && ( ( r = strchr( q, '~' ) ) != NULL ) && (*(r + 1) == '_') ) {
            *r++ = (char)0x87;
            *r++ = (char)0x56;
            return ( q );
        }
    }

    return ( convertCode( p, "EUC-JP", "CP932" ) );
}

/*
 * Run a string through the "ISO-2022-JP" -> "CP932" conversion.
 *
 *   p       : NUL-terminated input text
 *   returns : whatever convertCode() returns for that conversion
 */
char    *
jis2sjis( const char *p )
{
    char    *converted;

    converted = convertCode( p, "ISO-2022-JP", "CP932" );

    return ( converted );
}


/*
 * UTF2SJIS( src, dst ): convert src to CP932 and store the result
 * pointer in dst, trying the decoders in this order:
 *   1. utf2sjis()
 *   2. euc2sjis(), if step 1 returned NULL
 *   3. jis2sjis(), if there is still no result or the result contains
 *      "(B" (presumably the tail of a left-over ISO-2022-JP escape
 *      sequence -- confirm)
 *   4. src itself, if every conversion failed
 * Both arguments are evaluated several times; pass plain variables.
 * Wrapped in do { } while ( 0 ) so that the macro behaves as a single
 * statement, including under an unbraced if/else.
 */
#define UTF2SJIS( src, dst )    do { \
    (dst) = utf2sjis( src ); \
    if ( !(dst) ) \
        (dst) = euc2sjis( src ); \
    if ( !(dst) || strstr( (dst), "(B" ) ) \
        (dst) = jis2sjis( src ); \
    if ( !(dst) ) \
        (dst) = (src); \
} while ( 0 )

/*
 * EUC2SJIS( src, dst ): convert src to CP932 and store the result
 * pointer in dst, trying the decoders in this order:
 *   1. euc2sjis()
 *   2. utf2sjis(), if step 1 returned NULL
 *   3. jis2sjis(), if there is still no result or the result contains
 *      "(B" (presumably the tail of a left-over ISO-2022-JP escape
 *      sequence -- confirm)
 *   4. src itself, if every conversion failed
 * Both arguments are evaluated several times; pass plain variables.
 * Wrapped in do { } while ( 0 ) so that the macro behaves as a single
 * statement, including under an unbraced if/else.
 */
#define EUC2SJIS( src, dst )    do { \
    (dst) = euc2sjis( src ); \
    if ( !(dst) ) \
        (dst) = utf2sjis( src ); \
    if ( !(dst) || strstr( (dst), "(B" ) ) \
        (dst) = jis2sjis( src ); \
    if ( !(dst) ) \
        (dst) = (src); \
} while ( 0 )


/*
 *  Core conversion routine for one piece of a log line.
 *
 *  URL-decodes buf when it contains '%' or "&#", converts the result to
 *  CP932 using a set of per-site heuristics, and writes it to stdout.
 *
 *  buf : NUL-terminated, writable text.  Some branches overwrite it in
 *        place and use it as scratch space.
 */
void
_decodeLog( char *buf )
{
    char    *p = buf, *q, *r, *s, *t;
    int     utfEuc = 0;

    /* First, decide between UTF-8 and EUC-JP when the line says so   */
    /*   (reduces the chance of a wrong conversion).                  */
    /* q is only used for the NULL tests here; it is reassigned below. */
    if ( ( ( q = strstr( p, "=UTF-8" ) ) != NULL ) ||
	 ( ( q = strstr( p, "=UTF8"  ) ) != NULL ) ||
	 ( ( q = strstr( p, "=utf-8" ) ) != NULL ) ||
	 ( ( q = strstr( p, "=utf8"  ) ) != NULL )    )
	utfEuc = 1; /* UTF-8 */
    else
        if ( ( ( q = strstr( p, "=EUC-JP" ) ) != NULL ) ||
	     ( ( q = strstr( p, "=euc-jp" ) ) != NULL )    )
            utfEuc = 2; /* EUC-JP */

    /* URL-decode only when an escape can be present; otherwise q is */
    /* the input buffer itself.                                      */
    if ( (strchr( p, '%' )  != NULL) ||
	 (strstr( p, "&#" ) != NULL)    )
	q = decodeURL( p );
    else
	q = p;

    if ( utfEuc == 1 ) {
	UTF2SJIS( q, r );
    }
    else if ( utfEuc == 2 ) {
	EUC2SJIS( q, r );
    }
    else if ( ( r = strchr( q, '?' ) ) != NULL ) {
	/* No explicit charset, but the text contains a '?'; */
	/* r points at that '?' for now.                     */
	char *pp;

	if ( ((s = strstr( q, "client=nifty&ie=euc-jp&oe=utf-8" ) ) != NULL) &&
	     ((s = strstr( s, "&funcno=1&" )) != NULL )                         ) {
	    char    tmp[BUFFER_SIZE];

	    /* Save the tail starting at "&funcno=1&", cut it off, convert */
	    /* the remainder as EUC-JP and append the saved tail again.    */
	    /* NOTE(review): r still points at the '?' inside q, so only   */
	    /* the text from '?' onward is copied into buf; and when q is  */
	    /* buf itself (no URL decoding happened) source and target of  */
	    /* this strcpy overlap -- confirm the intent.                  */
	    strcpy( tmp, s );
	    *s = '\0';
	    strcpy( buf, r );
	    q = buf;
	    EUC2SJIS( q, r );
	    strcat( r, tmp );
	}
	else
	if ( (strstr( q, "search.yahoo.co.jp" ) ||
	      strstr( q, "search.nifty.com"   ) ||
	      strstr( q, "search.goo.ne.jp"   )    ) &&
	     ((pp = strchr( q, 0xAD )) != NULL)      &&
	     (*(pp + 1) == 0xEA)                        ) {
	    /* Mis-conversion workaround (when the byte sequence */
	    /*   0xAD 0xEA occurs; specific to Yahoo! Japan).    */
	    /* The two bytes are overwritten with "? " before    */
	    /* the EUC-JP conversion.                            */
	    /* NOTE(review): *(pp + 1) is a plain char; where    */
	    /* char is signed the comparison with 0xEA can never */
	    /* be true, so this branch would never be taken.     */
	    *pp++ = '?';
	    *pp++ = 0x20;
	    EUC2SJIS( q, r );
	}
	else {
	    UTF2SJIS( q, r );
	}
    }
    else
	r = q;

    /* Only the result of the second strstr() is left in s; s is not */
    /* used inside this branch.                                      */
    if ( ( ( s = strstr( r, "jpsearch.naver.com" ) ) != NULL ) &&
	 ( ( s = strstr( r, "&query"             ) ) != NULL )    ) {
	/* jpsearch.naver.com is a Korean search engine (uses EUC-KR) */
	unsigned char *pp = q;
	/* EUC-KR -> EUC-JP conversion (only for characters that can be mapped). */
	/* The bytes of q are rewritten in place, then q is converted again.     */
	for ( ; pp && *pp; pp++ ) {
	    if ( *pp == 0xAA ) {
		/* hiragana */
		*pp = 0xA4;
		continue;
	    }
	    if ( *pp == 0xAB ) {
		/* katakana */
		*pp = 0xA5;
		continue;
	    }
	    if ( (*pp == 0xEE) && (*(pp+1) == 0xE5) ) {
		/* kanji "aka" (red) */
		*pp++ = 0xC0;
		*pp   = 0xD6;
		continue;
	    }
	    if ( (*pp == 0xDC) && (*(pp+1) == 0xD2) ) {
		/* kanji "boku" (I) */
		*pp++ = 0xCB;
		*pp   = 0xCD;
		continue;
	    }
	}

	EUC2SJIS( q, r );
    }

    /* s ends up pointing at whichever of the two "=" markers was found in r. */
    if ( ( ( s = strstr( r, "infoseek" ) ) != NULL)       &&
	 ( ( ( s = strstr( r, "=検" )     ) != NULL) ||
	   ( ( s = strstr( r, "=新" )     ) != NULL)    )     ) {
	/* Several character encodings are mixed on one line */
	/*   (infoseek-specific workaround).                 */
	/* The tail from the marker on is saved, the head is */
	/* copied to buf and converted again as EUC-JP, and  */
	/* the saved tail is appended to the result.         */
	char    tmp[/* BUFSIZ */ BUFFER_SIZE];
	strcpy( tmp, s );
	*s = '\0';
	strcpy( buf, r );
	q = buf;
	EUC2SJIS( q, r );
	strcat( r, tmp );
	/* Repair a known wrong two-character word in the result.  The  */
	/* fixed count of 4 presumably relies on both literals being    */
	/* two 2-byte characters in this file's encoding -- confirm.    */
	if ( ( s = strstr( r, "検鷺" ) ) != NULL ) {
#if 0
	    strcpy( s, "検索\n" );
#else  /* v  also salvage as much as possible of the text after the repaired word (2003.10.16) */
	    strncpy( s, "検索", 4 );
	    /* If "input+type" occurs after the repaired word, drop what */
	    /* lies in between and continue with "&input+type...".       */
	    if ( t = strstr( r, "input+type" ) ) {
		if ( t > s ) {
		    *(s + 4) = '&';
		    strcpy( s + 5, t );
		}
	    }
#endif  /* ^  also salvage as much as possible of the text after the repaired word (2003.10.16) */
	}
    }

    fputs( r, stdout );
}

/*
 *  Read a log stream line by line and write the decoded text to stdout.
 *
 *  Two line layouts are handled:
 *    - lines containing " -> " (access-analysis log written by
 *      reverse.cgi, see
 *      http://www.na.rim.or.jp/~tsupo/program/reverse/reverse2.html),
 *      optionally followed by a " => " part;
 *    - every other line (the older access-analysis log format, see
 *      http://www.na.rim.or.jp/~tsupo/program/reverse/).
 *
 *  fp : stream to read from
 */
void
decodeLog( FILE *fp )
{
    char    line[BUFFER_SIZE];
    char    *arrow;
    char    *fat;
    char    *tail;

    for ( ;; ) {
        if ( fgets( line, BUFFER_SIZE - 1, fp ) == NULL )
            break;

        arrow = strstr( line, " -> " );
        if ( arrow == NULL ) {
            /* older log format: the whole line is one piece */
            _decodeLog( line );
            continue;
        }

        /* Cut the line at the blank in front of "->"; the blank is */
        /* written back by hand after the first piece.              */
        *arrow = '\0';
        arrow++;
        _decodeLog( line );
        putchar( ' ' );

        fat = strstr( arrow, " => " );
        if ( fat == NULL ) {
            _decodeLog( arrow );
            continue;
        }

        /* Same cut for the " => " part. */
        *fat = '\0';
        fat++;
        _decodeLog( arrow );
        putchar( ' ' );

        /* Last piece: URL-decode and try UTF-8, then EUC-JP on the */
        /* undecoded text, then fall back to the text as read.      */
        tail = utf2sjis( decodeURL( fat ) );
        if ( (tail == NULL) || (*tail == '\0') ) {
            tail = euc2sjis( fat );
            if ( (tail == NULL) || (*tail == '\0') )
                tail = fat;
        }
        fputs( tail, stdout );
    }
}


/*
 * Program entry point.
 *
 * Decodes every file named on the command line, in order, or stdin when
 * no file is given.
 *
 * returns : EXIT_SUCCESS when all inputs could be read, EXIT_FAILURE
 *           when at least one named file could not be opened (the
 *           remaining files are still processed).
 */
int
main( int argc, char *argv[] )
{
    FILE    *fp;
    int     status = EXIT_SUCCESS;
    int     i;

    if ( argc <= 1 ) {
        decodeLog( stdin );
        return ( status );
    }

    for ( i = 1; i < argc; i++ ) {
        fp = fopen( argv[i], "r" );
        if ( fp == NULL ) {
            /* Report the problem instead of skipping the file silently. */
            perror( argv[i] );
            status = EXIT_FAILURE;
            continue;
        }
        decodeLog( fp );
        fclose( fp );
    }

    return ( status );
}