libopc
opc_text.c

Sample program which extracts all text from a Word document and dumps it as HTML.

/*
Extract all text of a Word document as HTML.
Usage:
opc_text FILENAME
Sample:
opc_text OOXMLI1.docx
*/
#include <opc/opc.h>
#include <stdio.h>
#include <time.h>
#ifdef WIN32
#include <crtdbg.h>
#endif
/*
 * Recursively walk the XML below the reader's current position and print the
 * document text to stdout as HTML.
 *
 *  - content of WordprocessingML "t" elements is printed with '<', '>' and
 *    '&' replaced by their HTML entities;
 *  - WordprocessingML "p" elements are wrapped in <p>...</p>;
 *  - the (NULL, NULL) element handler simply recurses (presumably it matches
 *    any other element -- confirm against the libopc mce macro docs).
 *
 * reader: text reader handed in by the caller; its embedded libxml2 reader
 *         (reader->reader) is queried for the text values.
 *
 * The mce_start_* / mce_end_* names are block-structuring macros declared
 * outside this file; their expansion is not visible here.
 *
 * NOTE(review): the two mce_end_children(reader) lines below have no visible
 * mce_start_children(reader) counterpart in this listing; lines may have been
 * lost when the sample was extracted -- confirm against the upstream source
 * before building.
 */
static void dumpText(mceTextReader_t *reader) {
/* Text run: emit its character data. */
mce_start_element(reader, _X("http://schemas.openxmlformats.org/wordprocessingml/2006/main"), _X("t")) {
mce_start_text(reader) {
/* Walk the NUL-terminated value of the current node one byte at a time. */
for(const xmlChar *txt=xmlTextReaderConstValue(reader->reader);0!=*txt;txt++) {
switch(*txt) {
/* Escape the characters that would otherwise be parsed as HTML markup. */
case '<':
printf("&lt;");
break;
case '>':
printf("&gt;");
break;
case '&':
printf("&amp;");
break;
default:
/* Every other byte is copied through unchanged. */
putc(*txt, stdout);
break;
}
}
} mce_end_text(reader);
} mce_end_children(reader);
} mce_end_element(reader);
/* Paragraph: wrap whatever the recursion prints in <p>...</p>. */
mce_start_element(reader, _X("http://schemas.openxmlformats.org/wordprocessingml/2006/main"), _X("p")) {
printf("<p>");
dumpText(reader);
printf("</p>\n");
} mce_end_element(reader);
/* Fallback handler: descend without printing any markup of its own. */
mce_start_element(reader, NULL, NULL) {
dumpText(reader);
} mce_end_element(reader);
} mce_end_children(reader);
}
int main( int argc, const char* argv[] )
{
#ifdef WIN32
_CrtSetDbgFlag (_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
#endif
opcContainer *c=opcContainerOpen(_X(argv[1]), OPC_OPEN_READ_ONLY, NULL, NULL);
if (NULL!=c) {
if (OPC_ERROR_NONE==opcXmlReaderOpen(c, &reader, _X("/word/document.xml"), NULL, 0, 0)) {
mce_start_document(&reader) {
mce_start_element(&reader, NULL, NULL) {
printf("<html>\n");
printf("<head>\n");
printf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n");
printf("</head>\n");
printf("<body>\n");
dumpText(&reader);
printf("<body>\n");
printf("</html>\n");
} mce_end_element(&reader);
} mce_end_document(&reader);
}
}
#ifdef WIN32
OPC_ASSERT(!_CrtDumpMemoryLeaks());
#endif
return 0;
}