diff --git a/README.md b/README.md index d711515..170e5e2 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ # html2text -html2text is a command line utility, written in C++, that converts HTML documents into plain text. +html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). diff --git a/html2text-1.3.2a-utf8.patch b/html2text-1.3.2a-utf8.patch deleted file mode 100644 index 442d187..0000000 --- a/html2text-1.3.2a-utf8.patch +++ /dev/null @@ -1,706 +0,0 @@ -diff -r -u -bB html2text-1.3.2a/Area.C html2text-1.3.2a-patched/Area.C ---- html2text-1.3.2a/Area.C 2003-11-23 12:05:29.000000000 +0100 -+++ html2text-1.3.2a-patched/Area.C 2005-05-13 22:19:59.862137688 +0200 -@@ -36,10 +36,13 @@ - #include - - #include "Area.h" -+#include "html.h" - #include "string.h" - - #define LATIN1_nbsp 160 - -+extern int use_encoding; -+ - /* ------------------------------------------------------------------------- */ - - #define malloc_array(type, size)\ -@@ -81,6 +84,27 @@ - - /* ------------------------------------------------------------------------- */ - -+/* utf_length() and utf_width() -+ * -+ * Very simplified algorithm of calculating length of UTF-8 -+ * string. No check for errors. Counting only ASCII bytes and -+ * leading bytes of UTF-8 multibyte sequences. All bytes like -+ * 10xxxxxx are dropped. If USE_UTF8 is false then returns -+ * usual length. --YS -+ */ -+ -+unsigned int -+Line::utf_length(size_type f, size_type t) const -+{ -+ size_type m = (t < length_ ? t : length_); -+ size_type r = m - f; -+ if(USE_UTF8) { -+ for (int i = f; i < m; i++) -+ if((cells_[i].character & 0xc0) == 0x80) r--; -+ } -+ return r; -+} -+ - void - Line::resize(size_type l) - { -@@ -236,6 +260,23 @@ - return *this; - } - -+unsigned int -+Area::utf_width() -+{ -+ size_type r = width_; -+ if(USE_UTF8) { r = 0; -+ for (size_type yy = 0; yy < height_; yy++) { -+ size_type r1 = 0; -+ for (int i = width_ - 1; i >= 0; i--) { -+ if(!r1 && isspace(cells_[yy][i].character)) continue; -+ if((cells_[yy][i].character & 0xc0) != 0x80) r1++; -+ } -+ if(r < r1) r = r1; -+ } -+ } -+ return r; -+} -+ - void - Area::resize(size_type w, size_type h) - { -@@ -439,7 +480,7 @@ - char c = p->character; - char a = p->attribute; - -- if (c == (char) LATIN1_nbsp) c = ' '; -+ if (c == (char) LATIN1_nbsp && !USE_UTF8) c = ' '; - - if (a == Cell::NONE) { - os << c; -Nur in html2text-1.3.2a-patched/: Area.C.orig. -diff -r -u -bB html2text-1.3.2a/Area.h html2text-1.3.2a-patched/Area.h ---- html2text-1.3.2a/Area.h 2003-11-23 12:05:29.000000000 +0100 -+++ html2text-1.3.2a-patched/Area.h 2005-05-13 22:19:59.863137536 +0200 -@@ -81,6 +81,8 @@ - Cell &operator[](size_type x) { return cells_[x]; } - const Cell *cells() const { return cells_; } - -+ unsigned int utf_length(size_type f, size_type t) const; -+ - void resize(size_type l); - void enlarge(size_type l) { if (l > length_) resize(l); } - -@@ -134,6 +136,8 @@ - Cell *operator[](size_type y) { return cells_[y]; } - const Area &operator>>=(size_type rs); - -+ unsigned int utf_width(); -+ - void resize(size_type w, size_type h); - void enlarge(size_type w, size_type h); - -Nur in html2text-1.3.2a-patched/: Area.h.orig. -diff -r -u -bB html2text-1.3.2a/format.C html2text-1.3.2a-patched/format.C ---- html2text-1.3.2a/format.C 2003-11-23 12:05:29.000000000 +0100 -+++ html2text-1.3.2a-patched/format.C 2005-05-13 22:19:59.865137232 +0200 -@@ -1210,6 +1210,7 @@ - } - - Line::size_type to = from + 1; -+ int to_from; - - Line::size_type lbp = (Line::size_type) -1; // "Last break position". - -@@ -1238,18 +1239,20 @@ - to++; - } - -- if (to - from > w && lbp != (Area::size_type) -1) { to = lbp; break; } -+ if (line.utf_length(from,to) > w && lbp != (Area::size_type) -1) -+ { to = lbp; break; } - } - -+ to_from = line.utf_length(from,to); - /* - * Copy the "from...to" range from the "line" to the bottom of the "res" - * Area. - */ - Area::size_type x = 0; - Area::size_type len = to - from; -- if (halign == Area::LEFT || len >= w) { ; } else -- if (halign == Area::CENTER) { x += (w - len) / 2; } else -- if (halign == Area::RIGHT) { x += w - len; } -+ if (halign == Area::LEFT || to_from >= w) { ; } else -+ if (halign == Area::CENTER) { x += (w - to_from) / 2; } else -+ if (halign == Area::RIGHT) { x += w - to_from; } - res->insert(line.cells() + from, len, x, res->height()); - - /* -Nur in html2text-1.3.2a-patched/: format.C.orig. -diff -r -u -bB html2text-1.3.2a/html2text.C html2text-1.3.2a-patched/html2text.C ---- html2text-1.3.2a/html2text.C 2003-11-23 12:05:29.000000000 +0100 -+++ html2text-1.3.2a-patched/html2text.C 2005-05-13 22:19:59.868136776 +0200 -@@ -148,9 +148,10 @@ - -o Redirect output into \n\ - -nobs Do not use backspaces for boldface and underlining\n\ - -ascii Use plain ASCII for output instead of ISO-8859-1\n\ -+ -utf8 Assume both terminal and input stream are in UTF-8 mode\n\ - "; - --int use_iso8859 = 1; -+int use_encoding = ISO8859; - - int - main(int argc, char **argv) -@@ -199,7 +200,8 @@ - if (!strcmp(arg, "-width" )) { width = atoi(argv[++i]); } else - if (!strcmp(arg, "-o" )) { output_file_name = argv[++i]; } else - if (!strcmp(arg, "-nobs" )) { use_backspaces = false; } else -- if (!strcmp(arg, "-ascii" )) { use_iso8859 = false; } else -+ if (!strcmp(arg, "-ascii" )) { use_encoding = ASCII; } else -+ if (!strcmp(arg, "-utf8" )) { use_encoding = UTF8; } else - { - std::cerr - << "Unrecognized command line option \"" -Nur in html2text-1.3.2a-patched/: html2text.C.orig. -diff -r -u -bB html2text-1.3.2a/html.h html2text-1.3.2a-patched/html.h ---- html2text-1.3.2a/html.h 2001-10-04 22:03:54.000000000 +0200 -+++ html2text-1.3.2a-patched/html.h 2005-05-13 22:19:59.866137080 +0200 -@@ -61,6 +61,11 @@ - - /* ------------------------------------------------------------------------- */ - -+enum {ASCII, ISO8859, UTF8}; -+#define USE_ISO8859 (use_encoding == ISO8859) -+#define USE_ASCII (use_encoding == ASCII) -+#define USE_UTF8 (use_encoding == UTF8) -+ - #define LATIN1_nbsp 160 - #define LATIN1_iexcl 161 - #define LATIN1_cent 162 -diff -r -u -bB html2text-1.3.2a/sgml.C html2text-1.3.2a-patched/sgml.C ---- html2text-1.3.2a/sgml.C 2003-11-23 12:09:11.000000000 +0100 -+++ html2text-1.3.2a-patched/sgml.C 2005-05-13 22:19:59.870136472 +0200 -@@ -62,261 +62,280 @@ - char name[8]; - int iso8859code; - char *asciistr; -+ unsigned long unicode; - } entities[] = { -- { "AElig", LATIN1_AElig, "AE" }, -- { "AMP", 0, "&" }, -- { "Aacute", LATIN1_Aacute, "A'" }, -- { "Acirc", LATIN1_Acirc, "A^" }, -- { "Agrave", LATIN1_Agrave, "A`" }, -- { "Alpha", 0, "A" }, -- { "Aring", LATIN1_Aring, "AA" }, -- { "Atilde", LATIN1_Atilde, "A~" }, -- { "Auml", LATIN1_Auml, "A\"" }, -- { "Beta", 0, "B" }, -- { "Ccedil", LATIN1_Ccedil, "C," }, -- { "Chi", 0, "H" }, -- { "Dagger", 0, "++" }, -- { "Delta", 0, "D" }, -- { "ETH", LATIN1_ETH, "D-" }, -- { "Eacute", LATIN1_Eacute, "E'" }, -- { "Ecirc", LATIN1_Ecirc, "E^" }, -- { "Egrave", LATIN1_Egrave, "E`" }, -- { "Epsilon", 0, "E" }, -- { "Eta", 0, "E" }, -- { "Euml", LATIN1_Euml, "E\"" }, -- { "GT", 0, ">" }, -- { "Gamma", 0, "G" }, -- { "Iacute", LATIN1_Iacute, "I'" }, -- { "Icirc", LATIN1_Icirc, "I^" }, -- { "Igrave", LATIN1_Igrave, "I`" }, -- { "Iota", 0, "I" }, -- { "Iuml", LATIN1_Iuml, "I\"" }, -- { "Kappa", 0, "K" }, -- { "LT", 0, "<" }, -- { "Lambda", 0, "L" }, -- { "Mu", 0, "M" }, -- { "Ntilde", LATIN1_Ntilde, "N~" }, -- { "Nu", 0, "N" }, -- { "OElig", 0, "OE" }, -- { "Oacute", LATIN1_Oacute, "O'" }, -- { "Ocirc", LATIN1_Ocirc, "O^" }, -- { "Ograve", LATIN1_Ograve, "O`" }, -- { "Omega", 0, "O" }, -- { "Omicron", 0, "O" }, -- { "Oslash", LATIN1_Oslash, "O/" }, -- { "Otilde", LATIN1_Otilde, "O~" }, -- { "Ouml", LATIN1_Ouml, "O\"" }, -- { "Phi", 0, "F" }, -- { "Pi", 0, "P" }, -- { "Prime", 0, "''" }, -- { "Psi", 0, "PS" }, -- { "QUOT", 0, "\"" }, -- { "Rho", 0, "R" }, -- { "Scaron", 0, "S" }, -- { "Sigma", 0, "S" }, -- { "THORN", LATIN1_THORN, "TH" }, -- { "Tau", 0, "T" }, -- { "Theta", 0, "TH" }, -- { "Uacute", LATIN1_Uacute, "U'" }, -- { "Ucirc", LATIN1_Ucirc, "U^" }, -- { "Ugrave", LATIN1_Ugrave, "U`" }, -- { "Upsilon", 0, "U" }, -- { "Uuml", LATIN1_Uuml, "U\"" }, -- { "Xi", 0, "X" }, -- { "Yacute", LATIN1_Yacute, "Y'" }, -- { "Yuml", 0, "Y\"" }, -- { "Zeta", 0, "Z" }, -- { "aacute", LATIN1_aacute, "a'" }, -- { "acirc", LATIN1_acirc, "a^" }, -- { "acute", LATIN1_acute, "'" }, -- { "aelig", LATIN1_aelig, "ae" }, -- { "agrave", LATIN1_agrave, "a`" }, -+ { "AElig", LATIN1_AElig, "AE", 0x00c6}, -+ { "AMP", 0, "&", 0x0026}, -+ { "Aacute", LATIN1_Aacute, "A'", 0x00c1}, -+ { "Acirc", LATIN1_Acirc, "A^", 0x00c2}, -+ { "Agrave", LATIN1_Agrave, "A`", 0x00c0}, -+ { "Alpha", 0, "A", 0x0391}, -+ { "Aring", LATIN1_Aring, "AA", 0x00c5}, -+ { "Atilde", LATIN1_Atilde, "A~", 0x00c3}, -+ { "Auml", LATIN1_Auml, "A\"", 0x00c4}, -+ { "Beta", 0, "B", 0x0392}, -+ { "Ccedil", LATIN1_Ccedil, "C,", 0x00c7}, -+ { "Chi", 0, "H", 0x03a7}, -+ { "Dagger", 0, "++", 0x2020}, -+ { "Delta", 0, "D", 0x0394}, -+ { "ETH", LATIN1_ETH, "D-", 0x00d0}, -+ { "Eacute", LATIN1_Eacute, "E'", 0x00c9}, -+ { "Ecirc", LATIN1_Ecirc, "E^", 0x00ca}, -+ { "Egrave", LATIN1_Egrave, "E`", 0x00c8}, -+ { "Epsilon", 0, "E", 0x0395}, -+ { "Eta", 0, "E", 0x0397}, -+ { "Euml", LATIN1_Euml, "E\"", 0x00cb}, -+ { "GT", 0, ">", 0x003e}, -+ { "Gamma", 0, "G", 0x0393}, -+ { "Iacute", LATIN1_Iacute, "I'", 0x00cd}, -+ { "Icirc", LATIN1_Icirc, "I^", 0x00ce}, -+ { "Igrave", LATIN1_Igrave, "I`", 0x00cc}, -+ { "Iota", 0, "I", 0x0399}, -+ { "Iuml", LATIN1_Iuml, "I\"", 0x00cf}, -+ { "Kappa", 0, "K", 0x039a}, -+ { "LT", 0, "<", 0x003c}, -+ { "Lambda", 0, "L", 0x039b}, -+ { "Mu", 0, "M", 0x039c}, -+ { "Ntilde", LATIN1_Ntilde, "N~", 0x00d1}, -+ { "Nu", 0, "N", 0x039d}, -+ { "OElig", 0, "OE", 0x0152}, -+ { "Oacute", LATIN1_Oacute, "O'", 0x00d3}, -+ { "Ocirc", LATIN1_Ocirc, "O^", 0x00d4}, -+ { "Ograve", LATIN1_Ograve, "O`", 0x00d2}, -+ { "Omega", 0, "O", 0x03a9}, -+ { "Omicron", 0, "O", 0x039f}, -+ { "Oslash", LATIN1_Oslash, "O/", 0x00d8}, -+ { "Otilde", LATIN1_Otilde, "O~", 0x00d5}, -+ { "Ouml", LATIN1_Ouml, "O\"", 0x00d6}, -+ { "Phi", 0, "F", 0x03a6}, -+ { "Pi", 0, "P", 0x03a0}, -+ { "Prime", 0, "''", }, -+ { "Psi", 0, "PS", 0x03a8}, -+ { "QUOT", 0, "\"", }, -+ { "Rho", 0, "R", 0x03a1}, -+ { "Scaron", 0, "S", 0x0161}, -+ { "Sigma", 0, "S", 0x03a3}, -+ { "THORN", LATIN1_THORN, "TH", 0x00de}, -+ { "Tau", 0, "T", 0x03a4}, -+ { "Theta", 0, "TH", 0x0398}, -+ { "Uacute", LATIN1_Uacute, "U'", 0x00da}, -+ { "Ucirc", LATIN1_Ucirc, "U^", 0x00db}, -+ { "Ugrave", LATIN1_Ugrave, "U`", 0x00d9}, -+ { "Upsilon", 0, "U", 0x03a5}, -+ { "Uuml", LATIN1_Uuml, "U\"", 0x00dc}, -+ { "Xi", 0, "X", 0x039e}, -+ { "Yacute", LATIN1_Yacute, "Y'", 0x00dd}, -+ { "Yuml", 0, "Y\"", 0x0178}, -+ { "Zeta", 0, "Z", 0x0396}, -+ { "aacute", LATIN1_aacute, "a'", 0x00e1}, -+ { "acirc", LATIN1_acirc, "a^", 0x00e2}, -+ { "acute", LATIN1_acute, "'", 0x00b4}, -+ { "aelig", LATIN1_aelig, "ae", 0x00e6}, -+ { "agrave", LATIN1_agrave, "a`", 0x00e0}, - { "alefsym", 0, "Aleph" }, -- { "alpha", 0, "a" }, -+ { "alpha", 0, "a", 0x03b1}, - { "amp", 0, "&" }, - { "and", 0, "AND" }, - { "ang", 0, "-V" }, - { "apos", 0, "'" }, -- { "aring", LATIN1_aring, "aa" }, -- { "asymp", 0, "~=" }, -- { "atilde", LATIN1_atilde, "a~" }, -- { "auml", LATIN1_auml, "a\"" }, -+ { "aring", LATIN1_aring, "aa", 0x00e5}, -+ { "asymp", 0, "~=", 0x2248}, -+ { "atilde", LATIN1_atilde, "a~", 0x00e3}, -+ { "auml", LATIN1_auml, "a\"", 0x00e5}, - { "bdquo", 0, "\"" }, -- { "beta", 0, "b" }, -- { "brvbar", LATIN1_brvbar, "|" }, -- { "bull", 0, " o " }, -+ { "beta", 0, "b", 0x03b2}, -+ { "brvbar", LATIN1_brvbar, "|", 0x00a6}, -+ { "bull", 0, " o ", 0x2022}, - { "cap", 0, "(U" }, -- { "ccedil", LATIN1_ccedil, "c," }, -- { "cedil", LATIN1_cedil, "," }, -- { "cent", LATIN1_cent, "-c-" }, -- { "chi", 0, "h" }, -- { "circ", 0, "^" }, -+ { "ccedil", LATIN1_ccedil, "c,", 0x00e7}, -+ { "cedil", LATIN1_cedil, ",", 0x00b8}, -+ { "cent", LATIN1_cent, "-c-", 0x00a2}, -+ { "chi", 0, "h", 0x03c7}, -+ { "circ", 0, "^", 0x005e}, - // { "clubs", 0, "[clubs]" }, - { "cong", 0, "?=" }, -- { "copy", LATIN1_copy, "(c)" }, -+ { "copy", LATIN1_copy, "(c)", 0x00a9}, - { "crarr", 0, "<-'" }, - { "cup", 0, ")U" }, -- { "curren", LATIN1_curren, "CUR" }, -+ { "curren", LATIN1_curren, "CUR", 0x00a4}, - { "dArr", 0, "vv" }, -- { "dagger", 0, "+" }, -+ { "dagger", 0, "+", 0x2020}, - { "darr", 0, "v" }, -- { "deg", LATIN1_deg, "DEG" }, -- { "delta", 0, "d" }, -+ { "deg", LATIN1_deg, "DEG", 0x00b0}, -+ { "delta", 0, "d", 0x03b4}, - // { "diams", 0, "[diamonds]" }, -- { "divide", LATIN1_divide, "/" }, -- { "eacute", LATIN1_eacute, "e'" }, -- { "ecirc", LATIN1_ecirc, "e^" }, -- { "egrave", LATIN1_egrave, "e`" }, -+ { "divide", LATIN1_divide, "/", 0x00f7}, -+ { "eacute", LATIN1_eacute, "e'", 0x00e9}, -+ { "ecirc", LATIN1_ecirc, "e^", 0x00ea}, -+ { "egrave", LATIN1_egrave, "e`", 0x00e8}, - { "empty", 0, "{}" }, -- { "epsilon", 0, "e" }, -- { "equiv", 0, "==" }, -- { "eta", 0, "e" }, -- { "eth", LATIN1_eth, "d-" }, -- { "euml", LATIN1_euml, "e\"" }, -- { "euro", 0, "EUR" }, -+ { "epsilon", 0, "e", 0x03b5}, -+ { "equiv", 0, "==", 0x2261}, -+ { "eta", 0, "e", 0x03b7}, -+ { "eth", LATIN1_eth, "d-", 0x00f0}, -+ { "euml", LATIN1_euml, "e\"", 0x00eb}, -+ { "euro", 0, "EUR", 0x20ac}, - { "exist", 0, "TE" }, - { "fnof", 0, "f" }, - { "forall", 0, "FA" }, -- { "frac12", LATIN1_frac12, " 1/2" }, -- { "frac14", LATIN1_frac14, " 1/4" }, -- { "frac34", LATIN1_frac34, " 3/4" }, -+ { "frac12", LATIN1_frac12, " 1/2",0x00bd}, -+ { "frac14", LATIN1_frac14, " 1/4",0x00bc}, -+ { "frac34", LATIN1_frac34, " 3/4",0x00be}, - { "frasl", 0, "/" }, -- { "gamma", 0, "g" }, -- { "ge", 0, ">=" }, -- { "gt", 0, ">" }, -+ { "gamma", 0, "g", 0x03b3}, -+ { "ge", 0, ">=", 0x2265}, -+ { "gt", 0, ">", 0x003e}, - { "hArr", 0, "<=>" }, - { "harr", 0, "<->" }, - // { "hearts", 0, "[hearts]" }, -- { "hellip", 0, "..." }, -- { "iacute", LATIN1_iacute, "i'" }, -- { "icirc", LATIN1_icirc, "i^" }, -- { "iexcl", LATIN1_iexcl, "!" }, -- { "igrave", LATIN1_igrave, "i`" }, -+ { "hellip", 0, "...", 0x2026}, -+ { "iacute", LATIN1_iacute, "i'", 0x00ed}, -+ { "icirc", LATIN1_icirc, "i^", 0x00ee}, -+ { "iexcl", LATIN1_iexcl, "!", 0x00a1}, -+ { "igrave", LATIN1_igrave, "i`", 0x00ec}, - { "image", 0, "Im" }, -- { "infin", 0, "oo" }, -- { "int", 0, "INT" }, -- { "iota", 0, "i" }, -- { "iquest", LATIN1_iquest, "?" }, -+ { "infin", 0, "oo", 0x221e}, -+ { "int", 0, "INT", 0x222b}, -+ { "iota", 0, "i", 0x03b9}, -+ { "iquest", LATIN1_iquest, "?", 0x00bf}, - { "isin", 0, "(-" }, -- { "iuml", LATIN1_iuml, "i\"" }, -- { "kappa", 0, "k" }, -+ { "iuml", LATIN1_iuml, "i\"", 0x00ef}, -+ { "kappa", 0, "k", 0x03ba}, - { "lArr", 0, "<=" }, -- { "lambda", 0, "l" }, -+ { "lambda", 0, "l", 0x03bb}, - { "lang", 0, "" }, - { "lsaquo", 0, "<" }, - { "lsquo", 0, "`" }, -- { "lt", 0, "<" }, -- { "macr", LATIN1_macr, "-" }, -+ { "lt", 0, "<", 0x003c}, -+ { "macr", LATIN1_macr, "-", 0x00af}, - { "mdash", 0, "--" }, -- { "micro", LATIN1_micro, "my" }, -- { "middot", LATIN1_middot, "." }, -- { "minus", 0, "-" }, -- { "mu", 0, "m" }, -+ { "micro", LATIN1_micro, "my", 0x00b5}, -+ { "middot", LATIN1_middot, ".", 0x00b7}, -+ { "minus", 0, "-", 0x2212}, -+ { "mu", 0, "m", 0x03bc}, - { "nabla", 0, "Nabla" }, -- { "nbsp", LATIN1_nbsp, " " }, -+ { "nbsp", LATIN1_nbsp, " ", 0x00a0}, - { "ndash", 0, "-" }, -- { "ne", 0, "!=" }, -+ { "ne", 0, "!=", 0x2260}, - { "ni", 0, "-)" }, - { "not", LATIN1_not, "NOT" }, - { "notin", 0, "!(-" }, - { "nsub", 0, "!(C" }, -- { "ntilde", LATIN1_ntilde, "n~" }, -- { "nu", 0, "n" }, -- { "oacute", LATIN1_oacute, "o'" }, -- { "ocirc", LATIN1_ocirc, "o^" }, -+ { "ntilde", LATIN1_ntilde, "n~", 0x00f1}, -+ { "nu", 0, "n", 0x03bd}, -+ { "oacute", LATIN1_oacute, "o'", 0x00f3}, -+ { "ocirc", LATIN1_ocirc, "o^", 0x00f4}, - { "oelig", 0, "oe" }, -- { "ograve", LATIN1_ograve, "o`" }, -+ { "ograve", LATIN1_ograve, "o`", 0x00f2}, - { "oline", LATIN1_macr, "-" }, -- { "omega", 0, "o" }, -- { "omicron", 0, "o" }, -+ { "omega", 0, "o", 0x03c9}, -+ { "omicron", 0, "o", 0x03bf}, - { "oplus", 0, "(+)" }, - { "or", 0, "OR" }, -- { "ordf", LATIN1_ordf, "-a" }, -- { "ordm", LATIN1_ordm, "-o" }, -- { "oslash", LATIN1_oslash, "o/" }, -- { "otilde", LATIN1_otilde, "o~" }, -+ { "ordf", LATIN1_ordf, "-a", 0x00aa}, -+ { "ordm", LATIN1_ordm, "-o", 0x00ba}, -+ { "oslash", LATIN1_oslash, "o/", 0x00f8}, -+ { "otilde", LATIN1_otilde, "o~", 0x00f5}, - { "otimes", 0, "(x)" }, -- { "ouml", LATIN1_ouml, "o\"" }, -- { "para", LATIN1_para, "P:" }, -- { "part", 0, "PART" }, -- { "permil", 0, " 0/00" }, -+ { "ouml", LATIN1_ouml, "o\"", 0x00f6}, -+ { "para", LATIN1_para, "P:", 0x00b6}, -+ { "part", 0, "PART",0x2202}, -+ { "permil", 0, " 0/00",0x2030}, - { "perp", 0, "-T" }, -- { "phi", 0, "f" }, -- { "pi", 0, "p" }, -+ { "phi", 0, "f", 0x03c6}, -+ { "pi", 0, "p", 0x03c0}, - { "piv", 0, "Pi" }, -- { "plusmn", LATIN1_plusmn, "+/-" }, -- { "pound", LATIN1_pound, "-L-" }, -+ { "plusmn", LATIN1_plusmn, "+/-", 0x00b1}, -+ { "pound", LATIN1_pound, "-L-", 0x00a3}, - { "prime", 0, "'" }, -- { "prod", 0, "PROD" }, -+ { "prod", 0, "PROD",0x220f}, - { "prop", 0, "0(" }, -- { "psi", 0, "ps" }, -+ { "psi", 0, "ps", 0x03c8}, - { "quot", 0, "\"" }, - { "rArr", 0, "=>" }, -- { "radic", 0, "SQRT" }, -+ { "radic", 0, "SQRT",0x221a}, - { "rang", 0, "/>" }, - { "raquo", LATIN1_raquo, ">>" }, -- { "rarr", 0, "->" }, -+ { "rarr", 0, "->", 0x2192}, - // { "rceil", 0, ">|" }, - { "rdquo", 0, "\"" }, - { "real", 0, "Re" }, -- { "reg", LATIN1_reg, "(R)" }, -+ { "reg", LATIN1_reg, "(R)", 0x00ae}, - // { "rfloor", 0, "|>" }, -- { "rho", 0, "r" }, -+ { "rho", 0, "r", 0x03c1}, - { "rsaquo", 0, ">" }, - { "rsquo", 0, "'" }, - { "sbquo", 0, "'" }, -- { "scaron", 0, "s" }, -+ { "scaron", 0, "s", 0x0161}, - { "sdot", 0, "DOT" }, -- { "sect", LATIN1_sect, "S:" }, -+ { "sect", LATIN1_sect, "S:", 0x00a7}, - { "shy", LATIN1_shy, "" }, -- { "sigma", 0, "s" }, -- { "sigmaf", 0, "s" }, -+ { "sigma", 0, "s", 0x03c3}, -+ { "sigmaf", 0, "s", 0x03c2}, - { "sim", 0, "~" }, - // { "spades", 0, "[spades]" }, - { "sub", 0, "(C" }, - { "sube", 0, "(_" }, -- { "sum", 0, "SUM" }, -+ { "sum", 0, "SUM", 0x2211}, - { "sup", 0, ")C" }, -- { "sup1", LATIN1_sup1, "^1" }, -- { "sup2", LATIN1_sup2, "^2" }, -- { "sup3", LATIN1_sup3, "^3" }, -+ { "sup1", LATIN1_sup1, "^1", 0x00b9}, -+ { "sup2", LATIN1_sup2, "^2", 0x00b2}, -+ { "sup3", LATIN1_sup3, "^3", 0x00b3}, - { "supe", 0, ")_" }, -- { "szlig", LATIN1_szlig, "ss" }, -- { "tau", 0, "t" }, -+ { "szlig", LATIN1_szlig, "ss", 0x00df}, -+ { "tau", 0, "t", 0x03c4}, - { "there4", 0, ".:" }, -- { "theta", 0, "th" }, -- { "thorn", LATIN1_thorn, "th" }, -- { "tilde", 0, "~" }, -- { "times", LATIN1_times, "x" }, -- { "trade", 0, "[TM]" }, -+ { "theta", 0, "th", 0x03b8}, -+ { "thorn", LATIN1_thorn, "th", 0x00fe}, -+ { "tilde", 0, "~", 0x02dc}, -+ { "times", LATIN1_times, "x", 0x00d7}, -+ { "trade", 0, "[TM]",0x2122}, - { "uArr", 0, "^^" }, -- { "uacute", LATIN1_uacute, "u'" }, -+ { "uacute", LATIN1_uacute, "u'", 0x00fa}, - { "uarr", 0, "^" }, -- { "ucirc", LATIN1_ucirc, "u^" }, -- { "ugrave", LATIN1_ugrave, "u`" }, -- { "uml", LATIN1_uml, "\"" }, -- { "upsilon", 0, "u" }, -- { "uuml", LATIN1_uuml, "u\"" }, -+ { "ucirc", LATIN1_ucirc, "u^", 0x00fb}, -+ { "ugrave", LATIN1_ugrave, "u`", 0x00f9}, -+ { "uml", LATIN1_uml, "\"", 0x00a8}, -+ { "upsilon", 0, "u", 0x03c5}, -+ { "uuml", LATIN1_uuml, "u\"", 0x00fc}, - { "weierp", 0, "P" }, -- { "xi", 0, "x" }, -- { "yacute", LATIN1_yacute, "y'" }, -- { "yen", LATIN1_yen, "YEN" }, -- { "yuml", LATIN1_yuml, "y\"" }, -- { "zeta", 0, "z" }, -+ { "xi", 0, "x", 0x03be}, -+ { "yacute", LATIN1_yacute, "y'", 0x00fd}, -+ { "yen", LATIN1_yen, "YEN", 0x00a5}, -+ { "yuml", LATIN1_yuml, "y\"", 0x00ff}, -+ { "zeta", 0, "z", 0x03b6}, - }; - --extern int use_iso8859; -+extern int use_encoding; - - /* ------------------------------------------------------------------------- */ - -+char ubuf[4]; -+ -+char *mkutf(unsigned long x) -+{ -+ memset(ubuf, 0, 4); -+ if(x < 128) ubuf[0] = x; -+ else if(x < 0x800) { -+ ubuf[0] = (0xc0 | ((x >> 6) & 0x1f)); -+ ubuf[1] = (0x80 | (x & 0x3f)); -+ } -+ else { -+ ubuf[0] = (0xe0 | ((x >> 12) & 0x0f)); -+ ubuf[1] = (0x80 | ((x >> 6) & 0x3f)); -+ ubuf[2] = (0x80 | (x & 0x3f)); -+ } -+ return ubuf; -+} -+ - void - replace_sgml_entities(string *s) - { -@@ -330,9 +349,9 @@ - */ - while (j < l && s->at(j) != '&') ++j; - /* -- * We could convert high-bit chars to "é" here if use_iso8859 -- * is off, then let them be translated or not. Is the purpose of -- * !use_iso8859 to allow SGML entities to be seen, or to strongly -+ * We could convert high-bit chars to "é" here if USE_ASCII -+ * is on, then let them be translated or not. Is the purpose of -+ * USE_ASCII to allow SGML entities to be seen, or to strongly - * filter against high-ASCII chars that might blow up a terminal - * that doesn't speak ISO8859? For the moment, "allow SGML entities - * to be seen" -- no filtering here. -@@ -370,7 +389,11 @@ - if (!isdigit(c)) break; - x = 10 * x + c - '0'; - } -- if (use_iso8859 || (x < 128)) { -+ if (USE_UTF8) { -+ s->replace(beg, j - beg, mkutf(x)); -+ j = beg + 1; -+ } -+ else if (USE_ISO8859 && (x < 256) || USE_ASCII && (x < 128)) { - s->replace(beg, j - beg, 1, (char) x); - j = beg + 1; - } else { -@@ -408,13 +431,17 @@ - (int (*)(const void *, const void *)) strcmp - ); - if (entity != NULL) { -- if (use_iso8859 && entity->iso8859code) { -+ if (USE_ISO8859 && entity->iso8859code) { - s->replace(beg, j - beg, 1, (char) entity->iso8859code); - j = beg + 1; -- } else if (entity->asciistr) { -+ } else if (USE_ASCII && entity->asciistr) { - s->replace(beg, j - beg, entity->asciistr); - j = beg + 1; - } /* else don't replace it at all, we don't have a translation */ -+ else if(USE_UTF8 && entity->unicode) { -+ s->replace(beg, j - beg, mkutf(entity->unicode)); -+ j = beg + 1; -+ } - } - } else { - ; /* EXTENSION: Allow literal '&' sometimes. */ -diff -r -u -bB html2text-1.3.2a/table.C html2text-1.3.2a-patched/table.C ---- html2text-1.3.2a/table.C 2002-07-22 13:32:50.000000000 +0200 -+++ html2text-1.3.2a-patched/table.C 2005-05-13 22:19:59.871136320 +0200 -@@ -175,7 +175,7 @@ - - (*number_of_columns_return - 1) * (column_spacing + 0), - Area::LEFT // Yields better results than "p->halign"! - )); -- p->width = tmp.get() ? tmp->width() : 0; -+ p->width = tmp.get() ? tmp->utf_width() : 0; - } - p->minimized = false; - -@@ -308,7 +308,7 @@ - left_of_column + old_column_width - 1, - Area::LEFT // Yields better results than "lc.halign"! - )); -- w = tmp->width(); -+ w = tmp->utf_width(); - if (w >= left_of_column + old_column_width) lc.minimized = true; - } - if (w > left_of_column + new_column_width) { diff --git a/html2text.spec b/html2text.spec index c28ff9d..289b555 100644 --- a/html2text.spec +++ b/html2text.spec @@ -1,53 +1,53 @@ Name: html2text -Version: 1.3.2a -Release: 3mamba -Summary: html2text is a command line utility, written in C++, that converts HTML documents into plain text. +Version: 2020.1.16 +Release: 1mamba +Summary: Converts a page of HTML into clean, easy-to-read plain ASCII text Group: Graphical Desktop/Applications/Office Vendor: openmamba Distribution: openmamba -Packager: Tiziana Ferro +Packager: Silvan Calarco URL: http://www.mbayer.de/html2text -Source: http://userpage.fu-berlin.de/%7Embayer/tools/%{name}-%{version}.tar.gz -Patch0: %{name}-1.3.2a-utf8.patch +Source: https://pypi.debian.net/html2text/html2text-%{version}.tar.gz +#Source: http://userpage.fu-berlin.de/%7Embayer/tools/%{name}-%{version}.tar.gz License: GPL BuildRoot: %{_tmppath}/%{name}-%{version}-root ## AUTOBUILDREQ-BEGIN -BuildRequires: glibc-devel -BuildRequires: libgcc -BuildRequires: libstdc++6-devel +BuildRequires: libpython3-devel ## AUTOBUILDREQ-END +Requires: python3 >= %python_version %description -html2text is a command line utility, written in C++, that converts HTML documents into plain text. +html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format). %prep %setup -q -%patch0 -p1 %build -%configure --prefix=/usr -%make +CFLAGS="%{optflags}" %{__python3} setup.py build %install [ "%{buildroot}" != / ] && rm -rf "%{buildroot}" -mkdir -p %{buildroot}%{_bindir} -install -m 755 html2text %{buildroot}%{_bindir} -mkdir -p %{buildroot}%{_mandir}/man1 -mkdir -p %{buildroot}%{_mandir}/man5 -install -m 644 html2text.1.gz %{buildroot}%{_mandir}/man1 -install -m 644 html2textrc.5.gz %{buildroot}%{_mandir}/man5 +%{__python3} setup.py install \ + -O1 --skip-build \ + --root="%{buildroot}" \ + --install-headers=%{python3_inc} \ + --install-lib=%{python3_sitearch} \ + --record=%{name}.filelist + +sed -i "s,.*/man/.*,&.gz," %{name}.filelist %clean [ "%{buildroot}" != / ] && rm -rf "%{buildroot}" -%files +%files -f %{name}.filelist %defattr(-,root,root) -%doc CHANGES COPYING CREDITS README TODO -%{_bindir}/html2text -%{_mandir}/man1/html2text.* -%{_mandir}/man5/html2textrc.* +%doc COPYING %changelog +* Tue Mar 03 2020 Silvan Calarco 2020.1.16-1mamba +- update to 2020.1.16 +- switch to python html2text package + * Wed Jul 18 2012 Silvan Calarco 1.3.2a-3mamba - added utf8 patch