antiword/antiword-0.37-docx.patch

183 lines
4.9 KiB
Diff
Raw Permalink Normal View History

Description: Try to reduce confusion around docx files
Now also checks for XML files and HTML files
Author: Olly Betts <olly@survex.com>
Bug-Debian: https://bugs.debian.org/758959
Bug-Debian: https://bugs.debian.org/791532
Forwarded: no
Last-Update: 2015-01-11
--- a/Docs/antiword.1
+++ b/Docs/antiword.1
@@ -14,7 +14,11 @@
.br
A wordfile named - stands for a Word document read from the standard input.
.br
-Only documents made by MS Word version 2 and version 6 or later are supported.
+Only the binary format documents made by MS Word version 2, 6, 7, 97, 2000 and
+2003 are supported. Newer Word versions default to using a completely
+different format consisting of XML files in a ZIP container (usually with a
+".docx" file extension) which antiword doesn't support. It also doesn't
+support the "flat" XML format which MS Word 2003 supported.
.SH OPTIONS
.TP
.BI "\-a " papersize
--- a/antiword.h
+++ b/antiword.h
@@ -695,6 +695,9 @@
extern BOOL bIsWordForDosFile(FILE *, long);
extern BOOL bIsRtfFile(FILE *);
extern BOOL bIsWordPerfectFile(FILE *);
+extern BOOL bIsZipFile(FILE *);
+extern BOOL bIsXMLFile(FILE *);
+extern BOOL bIsHTMLFile(FILE *);
extern BOOL bIsWinWord12File(FILE *, long);
extern BOOL bIsMacWord45File(FILE *);
extern int iGuessVersionNumber(FILE *, long);
--- a/main_u.c
+++ b/main_u.c
@@ -187,10 +187,29 @@
werr(0, "%s is not a Word Document."
" It is probably a Rich Text Format file",
szFilename);
- } if (bIsWordPerfectFile(pFile)) {
+ } else if (bIsWordPerfectFile(pFile)) {
werr(0, "%s is not a Word Document."
" It is probably a Word Perfect file",
szFilename);
+ } else if (bIsZipFile(pFile)) {
+ werr(0, "%s is not a Word Document."
+ " It seems to be a ZIP file, so is probably"
+ " an OpenDocument file, or a \"docx\" file"
+ " from MS Word 2007 or newer"
+ " (antiword only handles binary format"
+ " documents from MS Word 2003 and earlier)",
+ szFilename);
+ } else if (bIsXMLFile(pFile)) {
+ werr(0, "%s is not a Word Document."
+ " It seems to be an XML file, perhaps"
+ " the XML format from MS Word 2003"
+ " (antiword only handles binary format"
+ " documents from MS Word 2003 and earlier)",
+ szFilename);
+ } else if (bIsHTMLFile(pFile)) {
+ werr(0, "%s is not a Word Document."
+ " It is probably an HTML file",
+ szFilename);
} else {
#if defined(__dos)
werr(0, "%s is not a Word Document or the filename"
--- a/wordlib.c
+++ b/wordlib.c
@@ -41,7 +41,7 @@
BOOL
bIsWordForDosFile(FILE *pFile, long lFilesize)
{
- static UCHAR aucBytes[] =
+ static const UCHAR aucBytes[] =
{ 0x31, 0xbe, 0x00, 0x00, 0x00, 0xab }; /* Word for DOS */
DBG_MSG("bIsWordForDosFile");
@@ -64,7 +64,7 @@
static BOOL
bIsWordFileWithOLE(FILE *pFile, long lFilesize)
{
- static UCHAR aucBytes[] =
+ static const UCHAR aucBytes[] =
{ 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1 };
int iTailLen;
@@ -108,7 +108,7 @@
BOOL
bIsRtfFile(FILE *pFile)
{
- static UCHAR aucBytes[] =
+ static const UCHAR aucBytes[] =
{ '{', '\\', 'r', 't', 'f', '1' };
DBG_MSG("bIsRtfFile");
@@ -122,7 +122,7 @@
BOOL
bIsWordPerfectFile(FILE *pFile)
{
- static UCHAR aucBytes[] =
+ static const UCHAR aucBytes[] =
{ 0xff, 'W', 'P', 'C' };
DBG_MSG("bIsWordPerfectFile");
@@ -131,13 +131,65 @@
} /* end of bIsWordPerfectFile */
/*
+ * This function checks whether the given file is or is not a ZIP file
+ */
+BOOL
+bIsZipFile(FILE *pFile)
+{
+ static const UCHAR aucBytes[] =
+ { 'P', 'K', 0x03, 0x04 }; /* ZIP signature "PK\3\4" */
+
+ DBG_MSG("bIsZipFile");
+
+ return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
+} /* end of bIsZipFile */
+
+/*
+ * This function checks whether the given file is or is not an XML file
+ */
+BOOL
+bIsXMLFile(FILE *pFile)
+{
+ static const UCHAR aucBytes[] =
+ { '<', '?', 'x', 'm', 'l' }; /* "<?xml" */
+
+ DBG_MSG("bIsXMLFile");
+
+ return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
+} /* end of bIsXMLFile */
+
+/*
+ * This function checks whether the given file is or is not an HTML file
+ */
+BOOL
+bIsHTMLFile(FILE *pFile)
+{
+ static const UCHAR aucBytes[2][5] = {
+ { '<', 'h', 't', 'm', 'l' }, /* "<html" (all lower case) */
+ { '<', 'H', 'T', 'M', 'L' }, /* "<HTML" (all upper case) */
+ };
+ int iIndex;
+
+ DBG_MSG("bIsHTMLFile");
+
+ for (iIndex = 0; iIndex < (int)elementsof(aucBytes); iIndex++) {
+ if (bCheckBytes(pFile,
+ aucBytes[iIndex],
+ elementsof(aucBytes[iIndex]))) {
+ return TRUE;
+ }
+ }
+ return FALSE;
+} /* end of bIsHTMLFile */
+
+/*
* This function checks whether the given file is or is not a "Win Word 1 or 2"
* document
*/
BOOL
bIsWinWord12File(FILE *pFile, long lFilesize)
{
- static UCHAR aucBytes[2][4] = {
+ static const UCHAR aucBytes[2][4] = {
{ 0x9b, 0xa5, 0x21, 0x00 }, /* Win Word 1.x */
{ 0xdb, 0xa5, 0x2d, 0x00 }, /* Win Word 2.0 */
};
@@ -171,7 +223,7 @@
BOOL
bIsMacWord45File(FILE *pFile)
{
- static UCHAR aucBytes[2][6] = {
+ static const UCHAR aucBytes[2][6] = {
{ 0xfe, 0x37, 0x00, 0x1c, 0x00, 0x00 }, /* Mac Word 4 */
{ 0xfe, 0x37, 0x00, 0x23, 0x00, 0x00 }, /* Mac Word 5 */
};