001:
002:
003:
004:
005:
006:
007:
008:
009:
010:
011: #include <stdio.h>
012: #include <stdlib.h>
013: #include <string.h>
014: #include <memory.h>
015: #include <ctype.h>
016: #include "html2txt.h"
017:
018:
019:
020:
021:
022: #ifndef USE_LIB
023: int main(int argc,char* argv[])
024: {
025: CHtml2Txt* h2t = new CHtml2Txt(argc,argv);
026: if(h2t->ReadCode()){
027: if(h2t->Convert())
028: h2t->WriteCode();
029: }
030: delete h2t;
031: return 0;
032: }
033: #endif
034:
035:
036:
037:
038:
039:
040:
041:
042:
043:
044: void CHtml2Txt::usage()
045: {
046: fprintf(stderr,"USAGE: %s htmlfile [-disp] [-link] [-out textfile] [-url baseurl]\n",
047: cmdopt.cmd);
048: fprintf(stderr,"option:\n");
049: fprintf(stderr,"\t-disp \toutput to display (default .txt file).\n");
050: fprintf(stderr,"\t-link \toutput link[ref url and image]\n");
051: fprintf(stderr,"\t-out htmlfile\toutput filename\n");
052: fprintf(stderr,"\t-url \tset base URL\n");
053: }
054:
055:
056:
057: int CHtml2Txt::Read_arg(int argc,char* argv[],CHtmlOption& cmdopt)
058: {
059:
060: strcpy(cmdopt.cmd,argv[0]);
061: if(argc==1) { return FALSE; }
062: for(int i=1;i<argc;i++) {
063: if(argv[i][0] == '-'){
064: if(!strcmp(argv[i], "-out")) {
065: if(++i>=argc) return FALSE;
066: strcpy(cmdopt.fOut,argv[i]);
067: } else if(!strcmp(argv[i], "-url")) {
068: if(++i>=argc) return FALSE;
069: SetURL(argv[i]);
070: } else if(!strcmp(argv[i], "-disp")) {
071: cmdopt.Hdst = DST_DISP;
072: } else if(!strcmp(argv[i], "-link")) {
073: cmdopt.link = SW_LINK;
074: } else {
075: fprintf(stderr,"unregonized option '%s'\n",argv[i]);
076: return FALSE;
077: }
078: } else {
079: strcpy(cmdopt.fIn,argv[i]);
080: }
081: }
082: return TRUE;
083: }
084:
085:
086:
087:
088: CHtml2Txt::CHtml2Txt(int argc,char* argv[])
089: {
090: Run = Read_arg(argc,argv,cmdopt);
091: if(Run == FALSE){
092: usage(); return;
093: }
094: InitClass();
095: }
096:
097:
098:
099:
100: CHtml2Txt::CHtml2Txt(CHtmlOption opt)
101: {
102: Run = TRUE;
103: cmdopt = opt;
104: InitClass();
105: }
106:
107:
108:
109:
110: CHtml2Txt::CHtml2Txt()
111: {
112: Run = TRUE;
113: InitClass();
114: }
115:
116: CHtml2Txt::~CHtml2Txt()
117: {
118: if(Run){
119: if(Hcode!=NULL) free(Hcode);
120: if(Tcode!=NULL) free(Tcode);
121: }
122: }
123:
124:
125:
126: void CHtml2Txt::InitClass()
127: {
128:
129: char dir[128],file[128],ext[10];
130: if(strcmp(cmdopt.fIn,std_in)){
131: split_path(cmdopt.fIn,dir,file,ext);
132: sprintf(DocTitle,"%s%s",file,ext);
133: if(!strcmp(cmdopt.fOut,std_out)){
134: if(cmdopt.Hdst == DST_FILE){
135: sprintf(cmdopt.fOut,"%s.txt",file);
136: }
137: }
138: } else {
139: sprintf(DocTitle,"html2txt");
140: }
141:
142: Hcode = Tcode = NULL;
143: OFP = NULL;
144: }
145:
146:
147:
148: void CHtml2Txt::FreeCode()
149: {
150: if(Run){
151: if(Hcode!=NULL) free(Hcode), Hcode = NULL;
152: if(Tcode!=NULL) free(Tcode), Tcode = NULL;
153: }
154: }
155:
156:
157:
158: void CHtml2Txt::SetURL(char* baseurl)
159: {
160: int ui,len=strlen(baseurl);
161: memset(BaseURL,0,sizeof(char)*MAX_PATH);
162: memset(BaseFile,0,sizeof(char)*MAX_PATH);
163: for(ui=len;ui>=0;ui--){
164: if(baseurl[ui]=='/') break;
165: }
166: if(ui>=7){
167: strncpy(BaseURL,baseurl,sizeof(char)*ui);
168: if(ui!=len)
169: strncpy(BaseFile,baseurl+ui+1,len-ui-1);
170: } else {
171: strcpy(BaseURL,baseurl);
172: }
173: cmdopt.url = BASE_URL;
174: }
175:
176:
177:
178: void CHtml2Txt::SetCode(char* src)
179: {
180:
181: lineSum = tabSum = fcLen = 0;
182: str_count(src,fcLen,lineSum,tabSum);
183:
184: Hcode = (char*)calloc(fcLen+1,sizeof(char));
185: strcpy(Hcode,src);
186: }
187:
188:
189:
190: int CHtml2Txt::ReadCode()
191: {
192: if(Run==FALSE) return FALSE;
193:
194: lineSum = tabSum = fcLen = 0;
195: Fstr_count(cmdopt.fIn,fcLen,lineSum,tabSum);
196: if(fcLen == 0){
197: return FALSE;
198: }
199:
200: Hcode = (char*)calloc(fcLen+1,sizeof(char));
201:
202: if(!read_string(cmdopt.fIn,fcLen,Hcode)){
203: fprintf(stderr,"input file reading error.\n");
204: return FALSE;
205: }
206: return TRUE;
207: }
208:
209:
210:
211: int CHtml2Txt::ReadFile(char* fname)
212: {
213: if(Run==FALSE) return FALSE;
214:
215: strcpy(cmdopt.fIn,fname);
216:
217: char dir[128],file[128],ext[10];
218: split_path(cmdopt.fIn,dir,file,ext);
219: sprintf(DocTitle,"%s%s",file,ext);
220: sprintf(cmdopt.fOut,"%s.txt",file);
221:
222: lineSum = tabSum = fcLen = 0;
223: Fstr_count(cmdopt.fIn,fcLen,lineSum,tabSum);
224: if(fcLen == 0){
225: return FALSE;
226: }
227:
228: Hcode = (char*)calloc(fcLen+1,sizeof(char));
229:
230: if(!read_string(cmdopt.fIn,fcLen,Hcode)){
231: fprintf(stderr,"input file reading error.\n");
232: return FALSE;
233: }
234: return TRUE;
235: }
236:
237:
238:
239: int CHtml2Txt::Convert()
240: {
241: if(Run==FALSE) return FALSE;
242:
243:
244:
245: int buflen = FMINSIZ+fcLen;
246: if(!Initbuf(buflen)){
247: fprintf(stderr,"cannot alloc output buffer.\n");
248: return FALSE;
249: }
250:
251: html2txt();
252: return TRUE;
253: }
254:
255:
256:
257: int CHtml2Txt::CodeLen()
258: {
259: return strlen(Tcode);
260: }
261:
262:
263:
264: void CHtml2Txt::GetCode(char* dst)
265: {
266: strcat(dst,Tcode);
267: }
268:
269:
270:
271: void CHtml2Txt::WriteCode()
272: {
273: if(cmdopt.Hdst == DST_FILE){
274: if(!strcmp(cmdopt.fOut,std_out)){
275: fprintf(stderr,"**%s\n\toutput file is not detected.\n",h2tprog);
276: return;
277: }
278: fprintf(stderr,"**%s\n\t%s -> %s\n",h2tprog,cmdopt.fIn,cmdopt.fOut);
279: }
280:
281:
282:
283: if(!cmdopt.Hdst){
284: if((OFP=fopen(cmdopt.fOut,"w"))==NULL){
285: fprintf(stderr,"output file[%s]: cannot writable.\n",cmdopt.fOut);
286: return;
287: }
288: } else OFP = stdout;
289:
290:
291:
292: TextOut();
293:
294: fclose(OFP);
295: }
296:
297:
298:
299: void CHtml2Txt::TextOut()
300: {
301: fwrite(Tcode,sizeof(char),strlen(Tcode),OFP);
302: }
303:
304:
305:
306: int CHtml2Txt::Initbuf(int buflen)
307: {
308: tci = 0; swQuote = 0;
309: Tcode = (char*)calloc(buflen,sizeof(char));
310: memset(Tcode,0,buflen);
311: if(Tcode == NULL) return 0;
312: return 1;
313: }
314:
315:
316:
317: void CHtml2Txt::strpush(char* buf)
318: {
319: for(int i=0;i<(signed)strlen(buf);i++){
320: Tcode[tci++] = buf[i];
321: if(swQuote) if(buf[i]=='\n')
322: Tcode[tci++] = '\t';
323: }
324: }
325:
326: void CHtml2Txt::charpush(char buf)
327: {
328: Tcode[tci++] = buf;
329: if(swQuote) if(buf=='\n')
330: Tcode[tci++] = '\t';
331: }
332:
333:
334:
335: int CHtml2Txt::rep_key(char* h_code,int iPos,char* sym,char* repstr)
336: {
337: if(Kwd_sch(h_code,fcLen,iPos,sym)){
338: strpush(repstr);
339: return 1;
340: }
341: return 0;
342: }
343:
344:
345:
346: int CHtml2Txt::rep_tag(char* h_code,int iPos,char* tag,char* repstr)
347: {
348: int endc = h_code[iPos+1+strlen(tag)];
349: if(Kwd_sch(h_code,fcLen,iPos+1,tag) &&
350: (h_code[iPos] == '<' ) &&
351: (endc == ' ' || endc == '>') ){
352: strpush(repstr);
353: return strlen(tag);
354: }
355: return 0;
356: }
357:
358:
359:
360: int CHtml2Txt::ch_title(char* h_code,int iPos)
361: {
362: if(Kwd_sch(h_code,fcLen,iPos,"<title>")) return 1;
363: return 0;
364: }
365: int CHtml2Txt::get_title(char* h_code,int iPos)
366: {
367: int i=iPos+6,iStart=1,tcnt=0;
368: memset(DocTitle,0,sizeof(char)*256);
369: while(iStart && i++<fcLen){
370: if(Kwd_sch(h_code,fcLen,i,"</title>")){
371: iStart = 0; i+=7;
372: DocTitle[tcnt] = 0;
373: strpush("Title: ");
374: strpush(DocTitle);
375: charpush('\n');
376: fprintf(stderr,"%s: [%s]\n",cmdopt.fIn,DocTitle);
377: } else {
378: DocTitle[tcnt++] = h_code[i];
379:
380: }
381: }
382: return (i-iPos);
383: }
384:
385:
386:
387: int CHtml2Txt::ch_line(char* h_code,int iPos)
388: {
389: if(Kwd_sch(h_code,fcLen,iPos,"<hr ") ||
390: Kwd_sch(h_code,fcLen,iPos,"<hr>") ||
391: Kwd_sch(h_code,fcLen,iPos,"<hr\n")) {
392: return 1;
393: }
394: return 0;
395: }
396: int CHtml2Txt::rep_line(char* h_code,int iPos)
397: {
398: int i=iPos,iStart=1;
399: unsigned char c;
400: strpush("<hr");
401: i+=2;
402: while(iStart && i++<fcLen){
403: c = h_code[i];
404: if(c == '>') { iStart=0; }
405: if(c == '\n') { charpush(' '); continue; }
406: charpush(c);
407: }
408: strpush("<p>\n");
409: return (i-iPos);
410: }
411:
412:
413:
414: int CHtml2Txt::ch_link(char* h_code,int iPos)
415: {
416: if(Kwd_sch(h_code,fcLen,iPos,"<a ")) return 1;
417: if(Kwd_sch(h_code,fcLen,iPos,"<a\n")) return 1;
418: return 0;
419: }
420: int CHtml2Txt::rep_link(char* h_code,int iPos)
421: {
422: int i=iPos,iStart=1,urlcnt=0,ref_start=0;
423: unsigned char c;
424: char url[MAX_PATH];
425: strpush("<a ");
426: i+=2;
427: while(iStart && i++<fcLen){
428: c = h_code[i];
429: if(c == '>') { iStart=0; }
430: if(c == '\n') { charpush(' '); continue; }
431: if(Kwd_sch(h_code,fcLen,i,"href=")){
432: urlcnt = 0; ref_start=1;
433: memset(url,0,sizeof(char)*MAX_PATH);
434: strpush("href=");
435: i+=4;
436: continue;
437: }
438: if(ref_start){
439: if(!urlcnt && c=='"') { continue; }
440: if(c != '"' && c != ' ' && c != '>') {
441: url[urlcnt++]=c;
442: continue;
443: } else {
444: ref_start = 0;
445: charpush('"');
446: if(cmdopt.url == BASE_URL){
447: if(!Kwd_sch(url,strlen(url),0,"http://") &&
448: !Kwd_sch(url,strlen(url),0,"mailto:") &&
449: !Kwd_sch(url,strlen(url),0,"javascript:") ) {
450: strpush(BaseURL);
451: charpush('/');
452: if(url[0] == '#'){
453: strpush(BaseFile);
454: }
455: strpush(url);
456: } else strpush(url);
457: } else strpush(url);
458: charpush('"');
459: if(c=='>') charpush('>');
460: continue;
461: }
462: }
463: charpush(c);
464: }
465: return (i-iPos);
466: }
467:
468:
469:
470: int CHtml2Txt::ch_image(char* h_code,int iPos)
471: {
472: if(Kwd_sch(h_code,fcLen,iPos,"<img ")) return 1;
473: if(Kwd_sch(h_code,fcLen,iPos,"<img\n")) return 1;
474: return 0;
475: }
476: int CHtml2Txt::rep_image(char* h_code,int iPos)
477: {
478: int i=iPos,iStart=1,imgcnt=0,src_start=0;
479: unsigned char c;
480: char img[MAX_PATH];
481: strpush("<img ");
482: i+=4;
483: while(iStart && i++<fcLen){
484: c = h_code[i];
485: if(c == '>') { iStart=0; }
486: if(c == '\n') { charpush(' '); continue; }
487: if(Kwd_sch(h_code,fcLen,i,"src=")){
488: imgcnt = 0; src_start=1;
489: memset(img,0,sizeof(char)*MAX_PATH);
490: strpush("src=");
491: i+=3;
492: continue;
493: }
494: if(src_start){
495: if(!imgcnt && c=='"') continue;
496: if(c != '\"' && c != ' ' && c != '>') {
497: img[imgcnt++]=c;
498: continue;
499: } else {
500: src_start = 0;
501: charpush('"');
502: if(cmdopt.url == BASE_URL){
503: if(!strstr(img,"http://")){
504: strpush(BaseURL);
505: charpush('/');
506: strpush(img);
507: } else strpush(img);
508: } else strpush(img);
509: charpush('"');
510: if(c=='>') charpush('>');
511: continue;
512: }
513: }
514: charpush(c);
515: }
516: return (i-iPos);
517: }
518:
519:
520:
521: int CHtml2Txt::ch_region(char* h_code,int iPos,int tagi)
522: {
523: unsigned char c = h_code[iPos],ec;
524: if(c!='<') return 0;
525: ec = h_code[iPos+1+strlen(skip_tag[tagi])];
526: if(Kwd_sch(h_code,fcLen,iPos+1,skip_tag[tagi]) &&
527: (ec == ' ' || ec == '>')){
528: return 1;
529: }
530: return 0;
531: }
532: int CHtml2Txt::region_skip(char* h_code,int iPos,int tagi)
533: {
534: int i=iPos,iStart=1;
535: char endstr[128];
536: sprintf(endstr,"</%s>",skip_tag[tagi]);
537: i+=strlen(skip_tag[tagi])+1;
538: while(iStart && i++<fcLen){
539: if(Kwd_sch(h_code,fcLen,i,endstr)){
540: iStart = 0;
541: }
542: }
543: i += strlen(endstr)-1;
544: return (i-iPos);
545: }
546:
547:
548:
549: void CHtml2Txt::html2txt()
550: {
551: int i,iPos,ltop,iStart=0;
552: int swPre=0,swComment=0;
553: unsigned char c,nxt_c,pre_c=0;
554:
555: for(iPos=0;iPos<fcLen;pre_c=Hcode[iPos++]){
556: c = Hcode[iPos];
557: nxt_c = (iPos<fcLen-1)?Hcode[iPos+1]:0;
558: if(c == '\n' && !swPre) continue;
559:
560:
561:
562: if(pre_c == '\n'){
563: ltop = line_schtop(Hcode,fcLen,iPos);
564: if(ltop && Hcode[iPos+ltop] == '<'){
565: iPos += (ltop-1);
566: continue;
567: }
568: }
569:
570:
571:
572: if(ch_title(Hcode,iPos)){
573: iPos += get_title(Hcode,iPos);
574: continue;
575: }
576:
577:
578:
579: iPos += rep_tag(Hcode,iPos,"h1","\n");
580: iPos += rep_tag(Hcode,iPos,"/h1","\n");
581: iPos += rep_tag(Hcode,iPos,"h2","\n");
582: iPos += rep_tag(Hcode,iPos,"/h2","\n");
583: iPos += rep_tag(Hcode,iPos,"h3","\n");
584: iPos += rep_tag(Hcode,iPos,"/h3","\n");
585: iPos += rep_tag(Hcode,iPos,"h4","\n");
586: iPos += rep_tag(Hcode,iPos,"/h4","\n");
587:
588: iPos += rep_tag(Hcode,iPos,"cite","\"");
589: iPos += rep_tag(Hcode,iPos,"/cite","\"");
590:
591: iPos += rep_tag(Hcode,iPos,"ul","\n");
592: iPos += rep_tag(Hcode,iPos,"ol","\n");
593: iPos += rep_tag(Hcode,iPos,"dl","\n");
594: iPos += rep_tag(Hcode,iPos,"li","\n\t・");
595: iPos += rep_tag(Hcode,iPos,"dt","\n\t・");
596:
597: iPos += rep_tag(Hcode,iPos,"/tr","\n");
598: iPos += rep_tag(Hcode,iPos,"div","\n");
599: iPos += rep_tag(Hcode,iPos,"/div","\n");
600: iPos += rep_tag(Hcode,iPos,"p","\n\n");
601: iPos += rep_tag(Hcode,iPos,"/p","\n");
602: iPos += rep_tag(Hcode,iPos,"br","\n");
603: iPos += rep_tag(Hcode,iPos,"/br","\n");
604:
605:
606:
607:
608:
609:
610: if(Kwd_sch(Hcode,fcLen,iPos,"<blockquote>")) swQuote = 1;
611: if(Kwd_sch(Hcode,fcLen,iPos,"</blockquote>")) swQuote = 0;
612:
613:
614:
615: if(Kwd_sch(Hcode,fcLen,iPos,"<pre>")) swPre = 1;
616: if(Kwd_sch(Hcode,fcLen,iPos,"</pre>")) swPre = 0;
617:
618:
619:
620: for(i=0;i<SKIPTAG_N;i++){
621: if(ch_region(Hcode,iPos,i)){
622: iPos += region_skip(Hcode,iPos,i);
623: continue;
624: }
625: }
626:
627:
628:
629: if(cmdopt.link==SW_LINK){
630: if(ch_line(Hcode,iPos)){
631: iPos += rep_line(Hcode,iPos);
632: continue;
633: }
634: }
635:
636:
637:
638:
639:
640: if(cmdopt.link==SW_LINK){
641: if(ch_image(Hcode,iPos)){
642: iPos += rep_image(Hcode,iPos);
643: continue;
644: }
645: }
646:
647: if(cmdopt.link==SW_LINK){
648: if(ch_link(Hcode,iPos)){
649: iPos += rep_link(Hcode,iPos);
650: continue;
651: }
652: if(Kwd_sch(Hcode,fcLen,iPos,"</a>")){
653: strpush("</a>");
654: iPos += 3;
655: continue;
656: }
657: }
658:
659:
660:
661: if(c == '<'){
662: if(nxt_c != '\0'){
663: if(nxt_c>='a' && nxt_c<='z' ) iStart=1;
664: if(nxt_c>='A' && nxt_c<='Z' ) iStart=1;
665: if(nxt_c=='/' ) iStart=1;
666: if(nxt_c=='?' ) iStart=1;
667: if(nxt_c=='!'){
668: if(Kwd_sch(Hcode,fcLen,iPos,"<!--")){
669: swComment = 1; strpush("\nComment:\n\t");
670: iPos += 3; continue;
671: } else {
672: iStart = 1;
673: }
674: }
675: }
676: }
677:
678: if(swComment && Kwd_sch(Hcode,fcLen,iPos,"-->")){
679: swComment = 0; strpush("\n");
680: iPos += 2; continue;
681: }
682:
683: if(c=='>' && iStart==1) {
684: iStart=0; continue;
685: }
686:
687:
688:
689: if(iStart==0){
690: if(c=='&'){
691: if(rep_key(Hcode,iPos,"<","<")) { iPos+=3; continue; }
692: if(rep_key(Hcode,iPos,">",">")) { iPos+=3; continue; }
693: if(rep_key(Hcode,iPos,"&","&")) { iPos+=4; continue; }
694: if(rep_key(Hcode,iPos,""","\"")) { iPos+=5; continue; }
695: if(rep_key(Hcode,iPos,"Á","")){ iPos+=7; continue; }
696: if(rep_key(Hcode,iPos,"Í","。")){ iPos+=7; continue; }
697: if(rep_key(Hcode,iPos,"Ð","ム")) { iPos+=4; continue; }
698: if(rep_key(Hcode,iPos,"Ñ","、")){ iPos+=7; continue; }
699: if(rep_key(Hcode,iPos,"Ó","「")){ iPos+=7; continue; }
700: if(rep_key(Hcode,iPos,"Ø","0")){ iPos+=7; continue; }
701: if(rep_key(Hcode,iPos,"Ú","」")){ iPos+=7; continue; }
702: if(rep_key(Hcode,iPos,"Ý","Y")){ iPos+=7; continue; }
703: if(rep_key(Hcode,iPos,"á","")){ iPos+=7; continue; }
704: if(rep_key(Hcode,iPos,"í","。")){ iPos+=7; continue; }
705: if(rep_key(Hcode,iPos,"ð","ム")) { iPos+=4; continue; }
706: if(rep_key(Hcode,iPos,"ñ","、")){ iPos+=7; continue; }
707: if(rep_key(Hcode,iPos,"ó","「")){ iPos+=7; continue; }
708: if(rep_key(Hcode,iPos,"ø","0")){ iPos+=7; continue; }
709: if(rep_key(Hcode,iPos,"ú","」")){ iPos+=7; continue; }
710: if(rep_key(Hcode,iPos,"ý","Y")){ iPos+=7; continue; }
711: if(rep_key(Hcode,iPos,"®","(r)")) { iPos+=4; continue; }
712: if(rep_key(Hcode,iPos,"©","(c)")){ iPos+=5; continue; }
713: if(rep_key(Hcode,iPos,"™","tm")){ iPos+=6; continue; }
714: if(rep_key(Hcode,iPos," "," ")) { iPos+=5; continue; }
715: }
716:
717: if(c=='\r' && nxt_c=='\n'){
718: c = '\n'; iPos++;
719: }
720: charpush(c);
721: }
722: }
723: charpush('\0');
724: }