After trying out a few options, I think the easiest way to do this at large scale is to use elinks.
On Ubuntu:
# Install the elinks package with apt-get.
sudo apt-get install elinks
# Run elinks in -dump mode on a.html and redirect its output to a.txt.
elinks -dump a.html > a.txt
Answer from formatjam on Stack Overflow
Convert HTML to Plain Text using C++ - Stack Overflow
How can I Convert HTML to Text in C#? - Stack Overflow
In C, how can one convert HTML strings to C strings? - Stack Overflow
how to change this html code to c++ code - C++ Forum
Can I convert HTML to C automatically?
Should I rewrite from scratch or use a converter for HTML to C?
What HTML features don't have a direct C equivalent?
Videos
After trying out a few options, I think the easiest way to do this at large scale is to use elinks.
On Ubuntu:
# Install the elinks package with apt-get.
sudo apt-get install elinks
# Run elinks in -dump mode on a.html and redirect its output to a.txt.
elinks -dump a.html > a.txt
Here is a C++ version for Windows, adapted from @Ben Anderson's C# solution. Note that the code isn't very robust yet. Also, all leading and trailing newlines are trimmed.
// The trimming method comes from https://stackoverflow.com/a/1798170/1613961
// Removes every leading and trailing character that appears in `newline`
// (CR and LF by default) from `str` and returns what is left in between.
// Returns an empty string when `str` is empty or contains only such characters.
//
// `newline` is a const reference: a non-const reference cannot bind to the
// temporary created from the default literal, so the previous signature was
// rejected by conforming compilers.
std::wstring trim(const std::wstring& str, const std::wstring& newline = L"\r\n")
{
    const auto strBegin = str.find_first_not_of(newline);
    if (strBegin == std::wstring::npos)
        return L""; // no content

    // At least one character survives, so find_last_not_of cannot return npos here.
    const auto strEnd = str.find_last_not_of(newline);
    return str.substr(strBegin, strEnd - strBegin + 1);
}
wstring HtmlToText(wstring htmlTxt) {
std::wregex stripFormatting(L"<[^>]*(>|$)"); //match any character between '<' and '>', even when end tag is missing
wstring s1 = std::regex_replace(htmlTxt, stripFormatting, L"");
wstring s2 = trim(s1);
wstring s3 = std::regex_replace(s2, std::wregex(L"\\ "), L" ");
return s3;
}
Just a note about the HtmlAgilityPack for posterity. The project contains an example of converting HTML to text, which, as noted by the OP, does not handle whitespace at all like anyone writing HTML would envisage. There are full-text rendering solutions out there, noted by others in answers to this question, which the code below is not (it cannot even handle tables in its current form), but it is lightweight and fast, which is all I wanted for creating a simple text version of HTML emails.
using System.IO;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
//small but important modification to class https://github.com/zzzprojects/html-agility-pack/blob/master/src/Samples/Html2Txt/HtmlConvert.cs
/// <summary>
/// Renders an HtmlAgilityPack document as plain text. Block-level elements are
/// separated with CRLF, list items get a bullet or a number, links are followed
/// by their URL and images are written as their alt text and source URL.
/// </summary>
public static class HtmlToText
{
    /// <summary>
    /// Matches a run of two or more whitespace characters; each run is collapsed
    /// to a single space. Built once instead of once per text node.
    /// </summary>
    private static readonly Regex WhitespaceRun = new Regex(@"\s{2,}");

    /// <summary>Loads the HTML file at <paramref name="path"/> and returns its plain-text rendering.</summary>
    /// <param name="path">Path passed to <c>HtmlDocument.Load</c>.</param>
    public static string Convert(string path)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load(path);
        return ConvertDoc(doc);
    }

    /// <summary>Parses <paramref name="html"/> and returns its plain-text rendering.</summary>
    /// <param name="html">HTML markup passed to <c>HtmlDocument.LoadHtml</c>.</param>
    public static string ConvertHtml(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        return ConvertDoc(doc);
    }

    /// <summary>Returns the plain-text rendering of an already loaded document.</summary>
    /// <exception cref="System.ArgumentNullException"><paramref name="doc"/> is null.</exception>
    public static string ConvertDoc(HtmlDocument doc)
    {
        if (doc == null)
        {
            throw new System.ArgumentNullException("doc");
        }

        using (StringWriter sw = new StringWriter())
        {
            ConvertTo(doc.DocumentNode, sw);
            sw.Flush();
            return sw.ToString();
        }
    }

    /// <summary>Renders every child of <paramref name="node"/> in document order, sharing one text state.</summary>
    internal static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        foreach (HtmlNode subnode in node.ChildNodes)
        {
            ConvertTo(subnode, outText, textInfo);
        }
    }

    /// <summary>Renders <paramref name="node"/> and its descendants to <paramref name="outText"/>, starting with fresh text state.</summary>
    /// <exception cref="System.ArgumentNullException"><paramref name="node"/> or <paramref name="outText"/> is null.</exception>
    public static void ConvertTo(HtmlNode node, TextWriter outText)
    {
        if (node == null)
        {
            throw new System.ArgumentNullException("node");
        }
        if (outText == null)
        {
            throw new System.ArgumentNullException("outText");
        }

        ConvertTo(node, outText, new PreceedingDomTextInfo(false));
    }

    /// <summary>
    /// Renders a single node. <paramref name="textInfo"/> carries the whitespace
    /// and list-numbering state shared with the preceding sibling nodes.
    /// </summary>
    internal static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        string html;
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                ConvertContentTo(node, outText, textInfo);
                break;

            case HtmlNodeType.Text:
                // Text inside <script> and <style> must not be output. A text node
                // without a parent is treated as ordinary text instead of
                // dereferencing null.
                HtmlNode parent = node.ParentNode;
                if (parent != null && (parent.Name == "script" || parent.Name == "style"))
                {
                    break;
                }

                html = ((HtmlTextNode)node).Text;

                // Is it in fact a special closing node output as text?
                if (HtmlNode.IsOverlappedClosingElement(html))
                {
                    break;
                }

                if (html.Length == 0)
                {
                    break;
                }

                // Leading whitespace is dropped when nothing has been written in the
                // current block yet, right after a <br>, or when the previous text
                // already ended with a space.
                if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
                {
                    html = html.TrimStart();
                    if (html.Length == 0)
                    {
                        break;
                    }
                    textInfo.WritePrecedingWhiteSpace = true;
                    textInfo.IsFirstTextOfDocWritten.Value = true;
                }

                outText.Write(HtmlEntity.DeEntitize(WhitespaceRun.Replace(html.TrimEnd(), " ")));

                // Trailing whitespace is written as exactly one space and remembered,
                // so the next text node does not add another one.
                textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]);
                if (textInfo.LastCharWasSpace)
                {
                    outText.Write(' ');
                }
                break;

            case HtmlNodeType.Element:
                string endElementString = null; // written after the element's children
                bool isInline;                  // inline elements share the parent's text state
                bool skip = false;              // true: children are not rendered
                int listIndex = 0;              // 1 for <ol> so its <li> children are numbered
                switch (node.Name)
                {
                    case "nav":
                        skip = true;
                        isInline = false;
                        break;

                    case "body":
                    case "section":
                    case "article":
                    case "aside":
                    case "h1":
                    case "h2":
                    case "header":
                    case "footer":
                    case "address":
                    case "main":
                    case "div":
                    case "p": // stylistic - adjust as you tend to use
                        // No blank line before the very first text of the document.
                        if (textInfo.IsFirstTextOfDocWritten)
                        {
                            outText.Write("\r\n");
                        }
                        endElementString = "\r\n";
                        isInline = false;
                        break;

                    case "br":
                        outText.Write("\r\n");
                        skip = true;
                        textInfo.WritePrecedingWhiteSpace = false;
                        isInline = true;
                        break;

                    case "a":
                        if (node.Attributes.Contains("href"))
                        {
                            string href = node.Attributes["href"].Value.Trim();
                            // Append the URL only when the link text does not already contain it.
                            // StringComparison is fully qualified because the file has no "using System;".
                            if (node.InnerText.IndexOf(href, System.StringComparison.InvariantCultureIgnoreCase) == -1)
                            {
                                endElementString = "<" + href + ">";
                            }
                        }
                        isInline = true;
                        break;

                    case "li":
                        // ListIndex is greater than zero only for children of an <ol>.
                        if (textInfo.ListIndex > 0)
                        {
                            outText.Write("\r\n{0}.\t", textInfo.ListIndex++);
                        }
                        else
                        {
                            outText.Write("\r\n*\t"); //using '*' as bullet char, with tab after, but whatever you want eg "\t->", if utf-8 0x2022
                        }
                        isInline = false;
                        break;

                    case "ol":
                        listIndex = 1;
                        goto case "ul";

                    case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems
                        endElementString = "\r\n";
                        isInline = false;
                        break;

                    case "img": //inline-block in reality
                        if (node.Attributes.Contains("alt"))
                        {
                            outText.Write('[' + node.Attributes["alt"].Value);
                            endElementString = "]";
                        }
                        if (node.Attributes.Contains("src"))
                        {
                            outText.Write('<' + node.Attributes["src"].Value + '>');
                        }
                        isInline = true;
                        break;

                    default:
                        isInline = true;
                        break;
                }

                if (!skip && node.HasChildNodes)
                {
                    // Block-level elements start their children with fresh whitespace state.
                    PreceedingDomTextInfo childInfo = isInline
                        ? textInfo
                        : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex };
                    ConvertContentTo(node, outText, childInfo);
                }

                if (endElementString != null)
                {
                    outText.Write(endElementString);
                }
                break;
        }
    }
}
/// <summary>
/// Mutable rendering state that HtmlToText passes from one node to the next
/// while it walks the children of an element.
/// </summary>
internal class PreceedingDomTextInfo
{
    /// <summary>
    /// Shared flag object; HtmlToText sets its Value to true when it writes text
    /// and hands the same instance to the state objects it creates for children.
    /// </summary>
    public readonly BoolWrapper IsFirstTextOfDocWritten;

    /// <summary>Creates a state object that uses the given shared flag.</summary>
    public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
    {
        this.IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
    }

    /// <summary>Number to print for the next list item; values above zero select numbered output.</summary>
    public int ListIndex { get; set; }

    /// <summary>True when the most recently written text node ended with whitespace.</summary>
    public bool LastCharWasSpace { get; set; }

    /// <summary>When false, leading whitespace of the next text node is trimmed.</summary>
    public bool WritePrecedingWhiteSpace { get; set; }
}
/// <summary>
/// Reference-type box around a single bool, so that several objects can hold
/// the same flag and observe each other's updates. Converts implicitly to and
/// from bool.
/// </summary>
internal class BoolWrapper
{
    /// <summary>The boxed value; false for a newly constructed wrapper.</summary>
    public bool Value { get; set; }

    /// <summary>Creates a wrapper whose Value is false.</summary>
    public BoolWrapper()
    {
    }

    /// <summary>Wraps <paramref name="flag"/> in a new instance.</summary>
    public static implicit operator BoolWrapper(bool flag)
    {
        BoolWrapper wrapper = new BoolWrapper();
        wrapper.Value = flag;
        return wrapper;
    }

    /// <summary>Reads the boxed value.</summary>
    public static implicit operator bool(BoolWrapper boolWrapper)
    {
        return boolWrapper.Value;
    }
}
As an example, the following HTML code...
<!DOCTYPE HTML>
<html>
<head>
</head>
<body>
<header>
Whatever Inc.
</header>
<main>
<p>
Thanks for your enquiry. As this is the 1<sup>st</sup> time you have contacted us, we would like to clarify a few things:
</p>
<ol>
<li>
Please confirm this is your email by replying.
</li>
<li>
Then perform this step.
</li>
</ol>
<p>
Please solve this <img alt="complex equation" src="http://upload.wikimedia.org/wikipedia/commons/8/8d/First_Equation_Ever.png"/>. Then, in any order, could you please:
</p>
<ul>
<li>
a point.
</li>
<li>
another point, with a <a href="http://en.wikipedia.org/wiki/Hyperlink">hyperlink</a>.
</li>
</ul>
<p>
Sincerely,
</p>
<p>
The whatever.com team
</p>
</main>
<footer>
Ph: 000 000 000<br/>
mail: whatever st
</footer>
</body>
</html>
...will be transformed into:
Whatever Inc.
Thanks for your enquiry. As this is the 1st time you have contacted us, we would like to clarify a few things:
1. Please confirm this is your email by replying.
2. Then perform this step.
Please solve this [complex equation<http://upload.wikimedia.org/wikipedia/commons/8/8d/First_Equation_Ever.png>]. Then, in any order, could you please:
* a point.
* another point, with a hyperlink<http://en.wikipedia.org/wiki/Hyperlink>.
Sincerely,
The whatever.com team
Ph: 000 000 000
mail: whatever st
...as opposed to:
Whatever Inc.
Thanks for your enquiry. As this is the 1st time you have contacted us, we would like to clarify a few things:
Please confirm this is your email by replying.
Then perform this step.
Please solve this . Then, in any order, could you please:
a point.
another point, with a hyperlink.
Sincerely,
The whatever.com team
Ph: 000 000 000
mail: whatever st
You could use this:
/// <summary>
/// Removes everything that looks like an HTML tag ("&lt;" followed by one or
/// more characters other than "&gt;", then "&gt;") from the input.
/// </summary>
/// <param name="HTMLText">Markup to strip.</param>
/// <param name="decode">When true (the default) the stripped text is also passed through HttpUtility.HtmlDecode.</param>
/// <returns>The text with tags removed, entity-decoded if requested.</returns>
public static string StripHTML(string HTMLText, bool decode = true)
{
    string withoutTags = Regex.Replace(HTMLText, "<[^>]+>", "", RegexOptions.IgnoreCase);
    if (!decode)
    {
        return withoutTags;
    }
    return HttpUtility.HtmlDecode(withoutTags);
}
Updated
Thanks for the comments; I have updated the answer to improve this function.
This isn't particularly hard, assuming you only care about &#xx; style entities. The bare-bones, let-everyone-else-worry-about-the-memory-management, mechanical, what's-a-regex way:
/*
 * Returns the numeric value (0-15) of one hexadecimal digit character.
 * Upper- and lower-case letters are both accepted. Returns -1 when `hex`
 * is not a hexadecimal digit.
 */
int hex_to_value(char hex) {
    if (hex >= '0' && hex <= '9') { return hex - '0'; }
    if (hex >= 'A' && hex <= 'F') { return hex - 'A' + 10; }
    /* Offset from 'a'; subtracting 'f' here mapped 'a'..'f' to 5..10. */
    if (hex >= 'a' && hex <= 'f') { return hex - 'a' + 10; }
    return -1;
}
/*
 * Writes a translated copy of the NUL-terminated string `src` to `dst`.
 *
 * Every sequence of the form "&#XY;", where X and Y are hexadecimal digits,
 * is replaced with the single byte whose value is 0xXY. Everything else,
 * including incomplete or malformed sequences, is copied unchanged.
 *
 * The output is NUL-terminated and is never longer than the input, so `dst`
 * needs room for strlen(src) + 1 bytes.
 */
void unescape(char* dst, const char* src) {
    while (*src) {
        /*
         * Each look-ahead read is guarded by the previous comparison having
         * matched a non-NUL character, so src is never read past its
         * terminator (hex_to_value returns -1 for '\0').
         */
        if (src[0] == '&' && src[1] == '#') {
            const int high = hex_to_value(src[2]);
            if (high != -1) {
                const int low = hex_to_value(src[3]);
                if (low != -1 && src[4] == ';') {
                    *dst++ = (char)((high << 4) | low);
                    src += 5;
                    continue;
                }
            }
        }
        /* Not the start of a complete sequence: copy this character literally. */
        *dst++ = *src++;
    }
    *dst = '\0';
}
Tedious, and way more code than seems reasonable, but not hard :)
I'd try to parse the number out from the string and then convert it to a number using atoi and then cast it to a character.
This is something I wrote in ~20 seconds so it's completely contrived:
/* Decodes the decimal character reference "&#39;" by hand. */
char html[] = "&#39;";  /* the entity text itself, not the decoded character: the code below indexes past the "&#" prefix */
char* pch = &html[2];   /* skip "&#", leaving "39;" */
int n = 0;
char c = 0;
pch[2] = '\0';          /* overwrite ';' so that pch is exactly "39" */
n = atoi(pch);          /* n == 39 */
c = n;                  /* c == '\'' (character code 39) */
Now c is '. Also, I don't really know much about HTML strings, so I might be missing something.
I was trying to find a way, both elegant and simple, to generate html pages in C when I finally came up with this solution, using open_memstream, curly braces and some macros...
EDIT: updated with Eternal_Weeb's comment.
#include <stdio.h>
#include <stdlib.h>
#include "html_tags.h"
/* Data rendered by user_tasks_html(): one user and that user's task list. */
typedef struct user_tasks {
    char  *user_name;   /* name printed in the page heading */
    int    task_count;  /* number of entries in `tasks` */
    char **tasks;       /* array of task_count task descriptions */
} user_tasks;
/*
 * Writes a complete HTML page for `data` to the stream `fp`, using the tag
 * macros from html_tags.h (they all write to a variable named `fp`).
 *
 * The page greets data->user_name and, when data->task_count is positive,
 * lists the tasks as "Task N: <text>" items numbered from 1.
 */
void user_tasks_html(FILE *fp, user_tasks *data) {
    DOCTYPE;
    HTML("en") {
        HEAD() {
            META("charset='utf-8'");
            META("name='viewport' content='width=device-width, initial-scale=1'");
            TITLE("Index page");
            META("name='description' content='Description'");
            META("name='author' content='Author'");
            META("property='og:title' content='Title'");
            LINK("rel='icon' href='/favicon.svg' type='image/svg+xml'");
            LINK("rel='stylesheet' href='css/styles.css'");
        }
        BODY("") {
            DIV("id='main'") {
                H1("id='title'") {
                    _("Hello %s", data->user_name);
                }

                const int task_total = data->task_count;
                if (task_total > 0) {
                    UL("class='default'") {
                        /* task_no is the 1-based number shown to the user. */
                        for (int task_no = 1; task_no <= task_total; task_no++) {
                            LI("class='default'") {
                                _("Task %d: %s", task_no, data->tasks[task_no - 1]);
                            }
                        }
                    }
                }
            }
        }
        SCRIPT("js/main.js");
    }
}
/*
 * Demo entry point: builds a small user_tasks value, renders it into an
 * in-memory stream with user_tasks_html(), then prints the generated HTML
 * and its size in bytes.
 *
 * Returns 0 on success and 1 when an allocation or the memory stream fails.
 */
int main(void) {
    user_tasks data;
    data.user_name = "John";
    data.task_count = 3;
    data.tasks = calloc((size_t)data.task_count, sizeof *data.tasks);
    if (data.tasks == NULL) {
        return 1;
    }
    data.tasks[0] = "Feed the cat";
    data.tasks[1] = "Clean the room";
    data.tasks[2] = "Go to the gym";

    char *html = NULL;
    size_t html_size = 0;
    FILE *fp = open_memstream(&html, &html_size);
    if (fp == NULL) {
        free(data.tasks);
        return 1;
    }

    user_tasks_html(fp, &data);

    /* html and html_size are read only after the stream has been closed. */
    if (fclose(fp) != 0) {
        free(html);
        free(data.tasks);
        return 1;
    }

    printf("%s\n", html);
    /* %zu is the conversion for size_t; %lu assumed size_t is unsigned long. */
    printf("%zu bytes\n", html_size);

    free(html);
    free(data.tasks);
    return 0;
}
/* html_tags.h: */
/*
 * html_tags.h - macros for writing HTML from C with nested-brace syntax.
 *
 * Every macro writes to a FILE* variable named `fp`, which must be in scope
 * at the point of use. Macros built on SCOPE are followed by a statement or
 * a { ... } block: the opening tag is written first, then the block runs,
 * then the closing tag is written. The remaining macros are single
 * expressions that write a complete element.
 */
#ifndef HTML_TAGS_H_
#define HTML_TAGS_H_

/*
 * Runs `atStart`, then the statement that follows exactly once, then `atEnd`.
 * Implemented as a for loop whose control variable goes 1 -> 0, so a `break`
 * inside the body leaves the loop without running `atEnd`.
 */
#define SCOPE(atStart, atEnd) for (int _scope_break = ((atStart), 1); _scope_break; _scope_break = ((atEnd), 0))

/* Document structure. */
#define DOCTYPE fputs("<!DOCTYPE html>", fp)
#define HTML(lang) SCOPE(fprintf(fp, "<html lang='%s'>", lang), fputs("</html>", fp))
#define HEAD() SCOPE(fputs("<head>", fp), fputs("</head>",fp))
#define TITLE(text) fprintf(fp, "<title>%s</title>", text)
#define META(attributes) fprintf(fp, "<meta %s>", attributes)
#define LINK(attributes) fprintf(fp, "<link %s>", attributes)
#define SCRIPT(src) fprintf(fp, "<script src='%s'></script>", src)
#define BODY(attributes) SCOPE(fprintf(fp, "<body %s>", attributes), fputs("</body>", fp))

/* Containers and lists. */
#define DIV(attributes) SCOPE(fprintf(fp, "<div %s>", attributes), fputs("</div>", fp))
#define UL(attributes) SCOPE(fprintf(fp, "<ul %s>", attributes), fputs("</ul>", fp))
#define OL(attributes) SCOPE(fprintf(fp, "<ol %s>", attributes), fputs("</ol>", fp))
#define LI(attributes) SCOPE(fprintf(fp, "<li %s>", attributes), fputs("</li>", fp))
#define BR fputs("<br>", fp)

/* printf-style text output: the arguments are passed to fprintf unchanged. */
#define _(...) fprintf(fp, __VA_ARGS__)

/* Headings. */
#define H1(attributes) SCOPE(fprintf(fp, "<h1 %s>", attributes), fputs("</h1>", fp))
#define H2(attributes) SCOPE(fprintf(fp, "<h2 %s>", attributes), fputs("</h2>", fp))
#define H3(attributes) SCOPE(fprintf(fp, "<h3 %s>", attributes), fputs("</h3>", fp))
#define H4(attributes) SCOPE(fprintf(fp, "<h4 %s>", attributes), fputs("</h4>", fp))
#define H5(attributes) SCOPE(fprintf(fp, "<h5 %s>", attributes), fputs("</h5>", fp))
#define H6(attributes) SCOPE(fprintf(fp, "<h6 %s>", attributes), fputs("</h6>", fp))

/* Inline and simple elements. */
#define P(content) fprintf(fp, "<p>%s</p>", content)
#define A(href, content) fprintf(fp, "<a href='%s'>%s</a>", href, content)
#define IMG(attributes) fprintf(fp, "<img %s>", attributes)
#define HR fputs("<hr/>", fp)

/* Tables. TABLE previously lacked the parenthesis that closes SCOPE(...), so any use failed to compile. */
#define TABLE(attributes) SCOPE(fprintf(fp, "<table %s>", attributes), fputs("</table>", fp))
#define TR(attributes) SCOPE(fprintf(fp, "<tr %s>", attributes), fputs("</tr>", fp))
#define TD(attributes) SCOPE(fprintf(fp, "<td %s>", attributes), fputs("</td>", fp))
#define TH(attributes) SCOPE(fprintf(fp, "<th %s>", attributes), fputs("</th>", fp))

/* Forms. */
#define FORM(attributes) SCOPE(fprintf(fp, "<form %s>", attributes), fputs("</form>", fp))
#define INPUT(attributes) fprintf(fp, "<input %s>", attributes)
#define OPTION(attributes, content) fprintf(fp, "<option %s>%s</option>", attributes, content)

#endif