How to Normalize (Remove rowspan & colspan from) a Financial HTML Table
Description: Converts an HTML financial table to its normalized format
(no rowspan or colspan), marking the header, stub and financial cells along the way.
Header: Header cells of the financial table (marked with head="true").
Stub: First non-financial column of the financial table (marked with
stub="true").
Subhead: All cells that are above FirstNumericRowIndex (marked
with subhead="true"),
or, in some cases, a cell below FirstNumericRowIndex that occupies a complete row
(its row is marked with subheadrow="true").
FirstNumericRowIndex: Index of the first row that has a
financial value.
FirstNumericColumnIndex: Index of the first column that has a
financial value.
Tools Used: Html Agility Pack
Diagrammatic Representation:
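a) Before (one table row per line; each cell is followed by "|"):
Side | Company Abc |
2012 | 2013 |
Assets | $5,000 | 6000 |
ABC Assets | USD300(1) | $4,000x |
| In Between SubHead |
Debts | 2000 | $10000 |
(Judging by the normalized result below, "Side" spans two rows, and "Company Abc"
and "In Between SubHead" each span two columns.)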
b) After Normalization (every row now has three cells; the cells added for the
removed rowspan/colspan are empty):
Side | Company Abc | |
| 2012 | 2013 |
Assets | $5,000 | 6000 |
ABC Assets | USD300(1) | $4,000x |
| In Between SubHead | |
Debts | 2000 | $10000 |
Main calling methods:
1) FormatAllTable(): starting point for normalization. Takes the HTML
table string as input.
2) NormalizeTable(): performs the normalization.
3) InsertCellForRowSpan(): creates the filler cells needed to remove a rowspan.
Supporting methods:
1) GetCleanText(): returns the cell text with whitespace and marker tokens removed.
2) CheckFinancialColumn(): checks whether a cell's content is a financial value.
3) GenerateFinancialCellSuperSubScript(): identifies superscript/subscript
footnotes in a financial cell and marks them. (Please comment out the call to
this method if you have no use for it in your code.)
4) GenerateFinancialCell(): cleans the inner HTML of a financial cell and
moves the relevant styles onto the cell.
5) SetFirstRowAndColIndex(): identifies the first numeric row and column
index of the table.
6) GetNormalizedCellCountIndex(): gets the index of the last cell of a row
with colspans taken into account.
7) isSubHeadRow(): identifies whether a row is an in-between subhead, i.e. a
subhead that appears below the header rows.
8) IsStringAlpha(): identifies alphabetic strings.
9) VerifyCurrency(): checks whether the given cell content is a currency value.
10) CheckNumeric(): identifies numeric cell content.
Way of Calling:
1) Put the table HTML into a string:
string strTable = "table html content";
2) Call the FormatAllTable method as shown below.
E.g. strTable = FormatAllTable(strTable);
Method Definitions:
// 1) FormatAllTable
/// <summary>
/// Entry point for normalization. Loads <paramref name="tableHtml"/> into an
/// Html Agility Pack document, and for every &lt;table&gt; node found determines the
/// first numeric row/column (SetFirstRowAndColIndex) and replaces the table's
/// inner HTML with the result of NormalizeTable.
/// </summary>
/// <param name="tableHtml">HTML containing zero or more tables. <c>null</c> is treated as empty.</param>
/// <returns>The outer HTML of the parsed document after all tables were processed.</returns>
public string FormatAllTable(string tableHtml)
{
    HtmlAgilityPack.HtmlDocument objHTMLdoc = new HtmlAgilityPack.HtmlDocument();
    try
    {
        if (tableHtml == null)
            tableHtml = string.Empty;

        // The row/cell loops downstream index ChildNodes directly, so whitespace text
        // nodes between tags must not exist.
        // NOTE(review): the first literal below is copied as found. If it is a plain
        // ASCII space it also strips the spaces between tag names and attributes;
        // confirm whether it was meant to be "&nbsp;" / U+00A0.
        tableHtml = tableHtml.Replace(" ", "").Replace("\t", "").Replace("\n", "").Replace("\r", "");
        objHTMLdoc.LoadHtml(tableHtml);

        HtmlNodeCollection tableList = objHTMLdoc.DocumentNode.SelectNodes("//table");
        if (tableList != null)
        {
            for (int tableIndex = 0; tableIndex < tableList.Count; tableIndex++)
            {
                try
                {
                    HtmlNode tableCopy = tableList[tableIndex];

                    // Fresh indexes per table: the column index must not leak from the
                    // previous table into this one.
                    int FirstNumericRowIndex = -1, FirstNumericColumnIndex = -1;
                    bool hasSubHead = false;

                    SetFirstRowAndColIndex(tableCopy, ref FirstNumericRowIndex, ref FirstNumericColumnIndex);
                    tableCopy.InnerHtml = NormalizeTable(tableCopy, FirstNumericRowIndex, FirstNumericColumnIndex, ref hasSubHead);
                }
                catch (Exception ex)
                {
                    // Best effort: a failure in one table must not stop the others.
                    System.Diagnostics.Trace.TraceError("FormatAllTable: table " + tableIndex + " skipped: " + ex);
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: return whatever the document holds at this point.
        System.Diagnostics.Trace.TraceError("FormatAllTable failed: " + ex);
    }
    return objHTMLdoc.DocumentNode.OuterHtml; // Output
}
// 2) NormalizeTable
/// <summary>
/// Removes rowspan/colspan from <paramref name="table"/> by inserting filler cells and
/// tags cells with <c>head</c>, <c>stub</c> and <c>subhead</c> attributes (and rows with
/// <c>subheadrow</c> / <c>BlankStub</c>) based on the first numeric row/column indexes.
/// Every cell that has child nodes is finally rebuilt through GenerateFinancialCell.
/// </summary>
/// <param name="table">Table node; its direct children are treated as rows and their children as cells.</param>
/// <param name="FirstNumericRowIndex">Index of the first row holding a financial value.</param>
/// <param name="FirstNumericColumnIndex">Index of the first column holding a financial value.</param>
/// <param name="isSubHeadFound">Set to <c>true</c> when at least one row is detected by isSubHeadRow.</param>
/// <returns>The table's inner HTML after processing. If an exception interrupts the work,
/// the inner HTML as it stands at that point is returned.</returns>
static string NormalizeTable(HtmlNode table, int FirstNumericRowIndex, int FirstNumericColumnIndex, ref bool isSubHeadFound)
{
    try
    {
        for (int trindex = 0; trindex < table.ChildNodes.Count; trindex++)
        {
            HtmlNode trnode = table.ChildNodes[trindex];

            if (trnode.Attributes["subheadrow"] == null
                && isSubHeadRow(trnode, FirstNumericColumnIndex, FirstNumericRowIndex, trindex))
            {
                trnode.Attributes.Add("subheadrow", "true");
                isSubHeadFound = true;
            }

            bool isStubDone = false;
            for (int tdindex = 0; tdindex < trnode.ChildNodes.Count; tdindex++)
            {
                HtmlNode tdnode = trnode.ChildNodes[tdindex];
                string content = GetCleanText(tdnode.InnerText);
                int mColSpan = 0;

                // --- rowspan: push filler cells into the rows below, then reset to 1 ---
                if (tdnode.Attributes["rowspan"] != null && tdnode.Attributes["rowspan"].Value != "1")
                {
                    int mRowSpan = Convert.ToInt32(tdnode.Attributes["rowspan"].Value);
                    InsertCellForRowSpan(ref table, trindex, tdindex, mRowSpan, tdnode);
                    tdnode.Attributes["rowspan"].Value = "1";
                    tdnode.Attributes.Add("Rowspanremoved", "true");
                    tdnode.Attributes.Add("OriginalRowspan", mRowSpan.ToString());
                }

                // --- colspan: add (colspan - 1) empty cells right after this one ---
                if (tdnode.Attributes["colspan"] != null && tdnode.Attributes["colspan"].Value != "1")
                {
                    mColSpan = Convert.ToInt32(tdnode.Attributes["colspan"].Value);
                    for (int m = 0; m < mColSpan - 1; m++)
                    {
                        HtmlNode newNode = HtmlNode.CreateNode("<td style=\"white-space:nowrap;padding-right:5px;padding-left:5px;\" data-append='true'></td>");
                        if (tdnode.Attributes["style"] != null)
                            newNode.Attributes["style"].Value = tdnode.Attributes["style"].Value + ";" + newNode.Attributes["style"].Value;
                        trnode.InsertAfter(newNode, tdnode);
                    }
                    tdnode.Attributes["colspan"].Value = "1";
                    tdnode.Attributes.Add("Colspanremoved", "true");
                    tdnode.Attributes.Add("OriginalColspan", mColSpan.ToString());
                }

                // --- first non-financial cell left of the numeric columns: head or stub ---
                if ((content != "" || tdindex == FirstNumericColumnIndex - 1)
                    && tdindex < FirstNumericColumnIndex
                    && !CheckFinancialColumn(content)
                    && !isStubDone)
                {
                    // Fixed precedence: the original "Count + 1 / 2" evaluated to "Count"
                    // (integer 1 / 2 == 0), which a colspan can never exceed once the filler
                    // cells above were inserted, so the head branch was unreachable.
                    if (mColSpan > (trnode.ChildNodes.Count + 1) / 2 && trindex < FirstNumericRowIndex)
                    {
                        tdnode.Attributes.Add("head", "true");
                    }
                    else
                    {
                        tdnode.Attributes.Add("stub", "true");
                        bool isJustAboveNumbers = trindex == FirstNumericRowIndex - 1 || trindex == FirstNumericRowIndex - 2;
                        // The stub is the only text in the row -> flag the row once.
                        if (isJustAboveNumbers
                            && content == GetCleanText(trnode.InnerText)
                            && trnode.Attributes["BlankStub"] == null)
                        {
                            trnode.Attributes.Add("BlankStub", "true");
                        }
                    }
                    isStubDone = true;
                }

                if (tdnode.ChildNodes.Count > 0)
                {
                    if (trindex < FirstNumericRowIndex && tdindex >= FirstNumericColumnIndex)
                    {
                        tdnode.Attributes.Add("head", "true");
                    }
                    else if (!CheckFinancialColumn(content) && content.Length > 4 && isStubDone)
                    {
                        tdnode.Attributes.Add("subhead", "true");
                    }
                    else if (trindex < FirstNumericRowIndex && isStubDone)
                    {
                        tdnode.Attributes.Add("subhead", "true");
                    }

                    // Short, non-empty financial values may carry super/subscript footnotes.
                    if (trindex > 0 && tdindex > 0 && content.Length < 15 && content != ""
                        && CheckFinancialColumn(content))
                    {
                        tdnode.InnerHtml = GenerateFinancialCellSuperSubScript(tdnode);
                    }

                    // GenerateFinancialCell swaps the cell inside trnode and returns the row's
                    // inner HTML; assigning it back re-parses the row's children.
                    trnode.InnerHtml = GenerateFinancialCell(tdnode, trnode);
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: keep whatever was normalized so far.
        System.Diagnostics.Trace.TraceError("NormalizeTable failed: " + ex);
    }
    return table.InnerHtml;
}
// 3) SetFirstRowAndColIndex
/// <summary>
/// Finds the first column (searching from column 1) and then the first row that hold a
/// financial value, as decided by CheckFinancialColumn.
/// </summary>
/// <param name="table">Table node; its direct children are treated as rows and their children as cells.</param>
/// <param name="FirstNumericRowIndex">Receives the first numeric row index, or -1 when none is found.</param>
/// <param name="FirstNumericColumnIndex">Receives the first numeric column index, or -1 when none is found.</param>
static void SetFirstRowAndColIndex(HtmlNode table, ref int FirstNumericRowIndex, ref int FirstNumericColumnIndex)
{
    int columnCount = GetHighestColumnCount(table);

    // Reset BOTH outputs. Previously only the row index was reset, so a positive column
    // index left over from an earlier table stopped the column scan after one column.
    FirstNumericRowIndex = -1;
    FirstNumericColumnIndex = -1;

    try
    {
        // --- column index: scan column by column, top to bottom ---
        for (int col = 1; col < columnCount; col++)
        {
            for (int row = 0; row < table.ChildNodes.Count; row++)
            {
                HtmlNode rowNode = table.ChildNodes[row];
                if (col >= rowNode.ChildNodes.Count)
                    continue;

                HtmlNode cell = rowNode.ChildNodes[col];
                if (!CheckFinancialColumn(GetCleanText(cell.InnerText)))
                    continue;

                if (cell.NextSibling != null)
                {
                    // A long (or single-character) neighbour that is neither numeric nor a
                    // gutter symbol disqualifies this cell; keep looking further down.
                    string next = GetCleanText(cell.NextSibling.InnerText);
                    if ((next.Length > 6 || next.Length == 1)
                        && !CheckNumeric(next)
                        && !VerifyGutterSymbol(next, "=="))
                    {
                        continue;
                    }
                }

                FirstNumericColumnIndex = col;
                break;
            }
            if (FirstNumericColumnIndex > 0)
                break;
        }

        // No numeric column: leave the row index at -1 instead of indexing cell -1
        // and relying on the resulting exception being swallowed.
        if (FirstNumericColumnIndex < 0)
            return;

        // --- row index: first row with a financial value at or after that column ---
        for (int row = 0; row < table.ChildNodes.Count; row++)
        {
            HtmlNode rowNode = table.ChildNodes[row];
            for (int col = FirstNumericColumnIndex; col < columnCount; col++)
            {
                if (col < rowNode.ChildNodes.Count
                    && CheckFinancialColumn(rowNode.ChildNodes[col].InnerText))
                {
                    FirstNumericRowIndex = row;
                    return;
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: the indexes keep whatever was determined before the failure.
        System.Diagnostics.Trace.TraceError("SetFirstRowAndColIndex failed: " + ex);
    }
}
// 4) CheckFinancialColumn
/// <summary>
/// Decides whether a cell's text is a financial value. Text that VerifyCurrency accepts as
/// currency-prefixed is financial when it is short (at most 4 characters after the
/// replacements) or when the three characters after the first are numeric. Years and
/// single bracketed characters are rejected. Everything else is decided by CheckNumeric.
/// </summary>
/// <param name="CellContent">Raw cell text; it is cleaned with GetCleanText first.</param>
/// <returns><c>true</c> for financial content; <c>false</c> otherwise, including when any exception occurs.</returns>
static bool CheckFinancialColumn(string CellContent)
{
    try
    {
        string text = GetCleanText(CellContent);

        if (VerifyCurrency(text, "Starts"))
        {
            text = text.ToLower().Replace("chf", "$").Replace("rs", "$").Replace("rs", "$").Replace("—", "—").Replace("—", "—");
            if (text.Length <= 4)
                return true;
            if (CheckNumeric(text.Substring(1, 3)))
                return true;
            // otherwise fall through to the generic numeric test below
        }
        else if (isYear(text))
        {
            return false;
        }
        else
        {
            string withoutBrackets = text.Replace("(", "").Replace(")", "").Replace("[", "").Replace("]", "").Replace("{", "").Replace("}", "").Trim();
            bool hasOpeningBracket = text.Contains("(") || text.Contains("[") || text.Contains("{");
            // e.g. a footnote reference such as "(1)" or "[a]"
            if (withoutBrackets.Length == 1 && hasOpeningBracket)
                return false;
        }

        return CheckNumeric(text);
    }
    catch
    {
        // any failure means "not financial"
    }
    return false;
}
// 5) VerifyCurrency
/// <summary>
/// Tests cell text for a currency marker. Every entry of <c>CurrencySymbols</c> is first
/// replaced by "$". With <paramref name="sType"/> "==" the text must then be exactly "$";
/// with any other value it must start with "$" and either be short once punctuation and
/// footnote markers are stripped, or have a numeric remainder (CheckNumeric).
/// </summary>
/// <param name="CellContent">Cell text to test.</param>
/// <param name="sType">"==" for an exact symbol match; anything else for a starts-with test.</param>
/// <returns><c>true</c> when the test succeeds; <c>false</c> otherwise, including when any exception occurs.</returns>
static bool VerifyCurrency(string CellContent, string sType)
{
    try
    {
        string normalized = CellContent;
        if (normalized != "" && normalized != "$")
        {
            for (int i = 0; i < CurrencySymbols.Length; i++)
            {
                normalized = normalized.Replace(CurrencySymbols[i], "$");
            }
        }

        string stripped = normalized.Replace("(", "").Replace(")", "").Replace(",", "").Replace("[", "").Replace("]", "").Replace("%", "").Replace("#@super#@", "").Replace("#@sub#@", "");

        if (sType == "==")
            return normalized == "$";

        if (!normalized.StartsWith("$"))
            return false;

        if (stripped.Replace("XX", "").Length < 12)
            return true;

        // long value: accept only if what follows the symbol is numeric
        return CheckNumeric(stripped.Replace("$", ""));
    }
    catch
    {
        return false;
    }
}
// 6) CheckNumeric
/// <summary>
/// Tests whether cell text is numeric once formatting is removed. Brackets, percent
/// signs, dots, commas and "year(s)" are stripped first. The placeholders "*", "-", "--",
/// a dash, text starting with "xx", "n/a" and "nm" count as numeric. Otherwise signs and
/// marker characters are removed, a leading currency symbol is dropped, and the remainder
/// must consist of the digits 0-9 only.
/// </summary>
/// <param name="CellContent">Raw cell text; it is cleaned with GetCleanText first.</param>
/// <returns><c>true</c> for numeric content; <c>false</c> otherwise, including for empty text,
/// text of 60 or more characters, or when any exception occurs.</returns>
static bool CheckNumeric(string CellContent)
{
    try
    {
        string text = GetCleanText(CellContent);
        text = text.Replace("(", "").Replace(")", "").Replace("––", "–").Replace("%", "").Replace("--", "-").Replace(".", "").Replace("[", "").Replace("]", "").Replace(".", "").Replace("**", "*").Replace(",", "").Replace("years", "").Replace("year", "");
        text = text.Trim();

        string lowered = text.ToLower();
        bool isPlaceholder =
            text == "*" || text == "-" || text == "--"
            || text == "—" || text == "—" || text == "—"
            || lowered.StartsWith("xx")
            || lowered == "n/a"
            || lowered == "nm";
        if (isPlaceholder)
            return true;

        text = text.Replace("–", "").Replace("-", "").Replace("+", "").Replace("#", "").Replace("@", "");

        // drop a leading currency symbol
        if (text.Length > 1 && text.Length < 16 && VerifyCurrency(text, "Starts"))
        {
            text = text.Substring(1, text.Length - 1);
        }

        // cut off a trailing bracketed part
        if (text.Length > 1 && text.Length < 16 && text.IndexOf("(") > 2)
        {
            text = text.Substring(0, text.IndexOf("("));
        }

        if (text.Length > 0 && text.Length < 60)
        {
            return Regex.IsMatch(text, @"^[0-9]*$", RegexOptions.None);
        }
    }
    catch
    {
        // any failure means "not numeric"
    }
    return false;
}
// 7) GenerateFinancialCellSuperSubScript
/// <summary>
/// Marks superscript/subscript footnotes inside a financial cell. Applies only to cells
/// without <c>head</c>/<c>stub</c> attributes whose single child is a &lt;p&gt;. For each child
/// of that paragraph styled with "vertical-align:sub" or "vertical-align:super", the
/// text is prefixed with "#@sub#@" or "#@super#@", and the first font-size found on
/// such a child is stored on the cell as <c>SuperSubFontSize</c>.
/// </summary>
/// <param name="CellNode">The cell to inspect; it is modified in place.</param>
/// <returns>The cell's inner HTML after marking (unchanged when nothing applies or an exception occurs).</returns>
static string GenerateFinancialCellSuperSubScript(HtmlNode CellNode)
{
    const string SubStyle = "vertical-align:sub";
    const string SuperStyle = "vertical-align:super";

    try
    {
        bool eligible = CellNode.ChildNodes.Count == 1
            && CellNode.Attributes["head"] == null
            && CellNode.Attributes["stub"] == null
            && CellNode.ChildNodes[0].Name == "p";

        if (eligible)
        {
            HtmlNode paragraph = CellNode.ChildNodes[0];
            for (int i = 0; i < paragraph.ChildNodes.Count; i++)
            {
                HtmlNode run = paragraph.ChildNodes[i];

                // Remember the font size of the first styled super/subscript run.
                if ((run.OuterHtml.Contains(SubStyle) || run.OuterHtml.Contains(SuperStyle))
                    && run.Attributes["style"] != null)
                {
                    foreach (string rawDeclaration in run.Attributes["style"].Value.Split(';'))
                    {
                        string declaration = rawDeclaration.Trim();
                        if (!declaration.StartsWith("font-size"))
                            continue;

                        string fontSize = declaration.Replace("font-size", "").Replace(":", "");
                        if (CellNode.Attributes["SuperSubFontSize"] == null && fontSize != "")
                            CellNode.Attributes.Add("SuperSubFontSize", fontSize);
                        break;
                    }
                }

                if (run.OuterHtml.Contains(SubStyle))
                {
                    if (run.ChildNodes.Count > 1)
                    {
                        // Mark only the first nested piece that carries the style.
                        for (int s = 0; s < run.ChildNodes.Count; s++)
                        {
                            HtmlNode piece = run.ChildNodes[s];
                            if (piece.OuterHtml.Contains(SubStyle))
                            {
                                piece.InnerHtml = piece.InnerHtml.Replace(piece.InnerText, "#@sub#@" + piece.InnerText);
                                break;
                            }
                        }
                    }
                    else if (GetCleanText(run.InnerText) != "")
                    {
                        run.InnerHtml = "#@sub#@" + run.InnerText;
                        break; // stop scanning the paragraph
                    }
                }

                if (run.OuterHtml.Contains(SuperStyle))
                {
                    if (run.ChildNodes.Count > 1)
                    {
                        for (int s = 0; s < run.ChildNodes.Count; s++)
                        {
                            HtmlNode piece = run.ChildNodes[s];
                            if (piece.OuterHtml.Contains(SuperStyle))
                            {
                                piece.InnerHtml = piece.InnerHtml.Replace(piece.InnerText, "#@super#@" + piece.InnerText);
                                break;
                            }
                        }
                    }
                    else if (GetCleanText(run.InnerText) != "")
                    {
                        run.InnerHtml = run.InnerHtml.Replace(run.InnerText, "#@super#@" + run.InnerText);
                        break; // stop scanning the paragraph
                    }
                }
            }
        }
    }
    catch (Exception)
    {
        // best effort: return the cell as it stands
    }
    return CellNode.InnerHtml;
}
// 8) GenerateFinancialCell
/// <summary>
/// Rebuilds a cell as a fresh &lt;td data-append="true"&gt; and swaps it into
/// <paramref name="trNode"/>. Cells marked <c>head</c>, <c>stub</c> or <c>subhead</c> keep
/// their inner HTML. All other cells keep only their text, and the font/text/color
/// styling found on the last non-empty child (and its span/font/b descendants) is
/// collected into the new cell's <c>style</c> attribute.
/// </summary>
/// <param name="CellNode">The cell to rebuild; it must be a child of <paramref name="trNode"/>.</param>
/// <param name="trNode">The row that contains the cell.</param>
/// <returns>The row's inner HTML after the replacement (the unchanged row HTML when an exception occurs).</returns>
static string GenerateFinancialCell(HtmlNode CellNode, HtmlNode trNode)
{
    try
    {
        string style = string.Empty;
        string mAppendType = (CellNode.Attributes["append-type"] != null)
            ? CellNode.Attributes["append-type"].Value
            : "content";

        bool isLabelCell = CellNode.Attributes["head"] != null
            || CellNode.Attributes["stub"] != null
            || CellNode.Attributes["subhead"] != null;

        // Label cells keep their markup; value cells are flattened to text.
        HtmlNode NewHTMLtd = HtmlNode.CreateNode(
            "<td data-append=\"true\" append-type='" + mAppendType + "'>"
            + (isLabelCell ? CellNode.InnerHtml : CellNode.InnerText)
            + "</td>");

        // Carry over every attribute the new cell does not define itself.
        foreach (HtmlAttribute atr in CellNode.Attributes)
        {
            bool alreadyPresent = NewHTMLtd.Attributes.Any(
                x => string.Equals(x.Name, atr.Name, StringComparison.OrdinalIgnoreCase));
            if (!alreadyPresent)
                NewHTMLtd.Attributes.Add(atr);
        }

        if (NewHTMLtd.Attributes["style"] != null)
            style = NewHTMLtd.Attributes["style"].Value;

        if (!isLabelCell && CellNode.ChildNodes.Count > 0)
        {
            // Last child with visible text; fall back to the first child.
            int mChild;
            for (mChild = CellNode.ChildNodes.Count - 1; mChild > -1; mChild--)
            {
                if (GetCleanText(CellNode.ChildNodes[mChild].InnerText) != "")
                    break;
            }
            if (mChild < 0)
                mChild = 0;

            HtmlNode ParaNode = CellNode.ChildNodes[mChild];
            if (ParaNode.Name == "div" && ParaNode.HasChildNodes)
                ParaNode = ParaNode.ChildNodes[0];

            if (ParaNode.Attributes["style"] != null)
            {
                foreach (string rawPart in ParaNode.Attributes["style"].Value.Split(';'))
                {
                    string part = rawPart.Trim();
                    if (part.StartsWith("text-indent"))
                        continue;
                    if (part.StartsWith("font") || part.StartsWith("text") || part.StartsWith("color:"))
                        style += ";" + part + ";";
                }
            }

            for (int spIndex = 0; spIndex < ParaNode.ChildNodes.Count; spIndex++)
            {
                HtmlNode spanNode = ParaNode.ChildNodes[spIndex];
                if (spanNode.Name == "font")
                {
                    // A <font> child takes over as the styling source for the checks below.
                    ParaNode = spanNode;
                    break;
                }
                if (spanNode.Name != "span" && spanNode.ChildNodes.Count > 0)
                    spanNode = spanNode.ChildNodes[0];

                if (spanNode.Name != "span"
                    || GetCleanText(spanNode.InnerText) == ""
                    || spanNode.Attributes["style"] == null)
                {
                    continue;
                }

                // Fixed: the marker written into cells is lowercase "#@sub#@", so the former
                // test for "#@SUB" never matched and subscript spans leaked their font-size.
                string spanText = spanNode.InnerText;
                bool hasFootnoteMarker =
                    spanText.IndexOf("#@super", StringComparison.OrdinalIgnoreCase) >= 0
                    || spanText.IndexOf("#@sub", StringComparison.OrdinalIgnoreCase) >= 0;

                foreach (string rawPart in spanNode.Attributes["style"].Value.Split(';'))
                {
                    string part = rawPart.Trim();
                    if (style.Contains(part))
                        continue; // already collected (also skips empty parts)
                    if (style.Contains("font-size:") && part.StartsWith("font-size:"))
                        continue; // first font-size wins
                    if (hasFootnoteMarker && part.StartsWith("font-size"))
                        continue; // footnote size must not become the cell's size
                    if (part.StartsWith("text-indent"))
                        continue;
                    if (part.StartsWith("font") || part.StartsWith("text") || part.StartsWith("color:"))
                        style += ";" + part + ";";
                }
            }

            if (ParaNode.Name == "b" || CellNode.InnerHtml.Contains("</b>"))
            {
                style += ";font-weight:bold;";
            }
            else if (ParaNode.Name == "font")
            {
                if (ParaNode.Attributes["color"] != null && ParaNode.Attributes["color"].Value != "")
                    style += ";color:" + ParaNode.Attributes["color"].Value + ";";

                if (ParaNode.Attributes["style"] != null)
                {
                    string fontStyle = ParaNode.Attributes["style"].Value.ToLower();
                    string compactFontStyle = fontStyle.Replace(" ", "");

                    if (compactFontStyle.Contains("font-weight:bold"))
                        style += ";font-weight:bold;";

                    if (compactFontStyle.Contains("font-size:"))
                    {
                        foreach (string rawPart in fontStyle.Split(';'))
                        {
                            string part = rawPart.Trim();
                            if (!part.StartsWith("font-size"))
                                continue;

                            style += ";" + part + ";";
                            // An explicit CSS size overrides the legacy size attribute.
                            if (ParaNode.Attributes["size"] != null)
                                ParaNode.Attributes["size"].Remove();
                            break;
                        }
                    }
                }

                if (ParaNode.Attributes["size"] != null && ParaNode.Attributes["size"].Value != "")
                {
                    try
                    {
                        int mSize = Convert.ToInt16(ParaNode.Attributes["size"].Value);
                        if (mSize < 8)
                            style += ";font-size:" + fontSizeInPoints[mSize].ToString() + "pt;";
                    }
                    catch
                    {
                        // non-numeric or out-of-range size attribute: no font-size is added
                    }
                }
            }
        }

        if (style != string.Empty)
        {
            if (NewHTMLtd.Attributes["style"] != null)
                NewHTMLtd.Attributes["style"].Remove();
            NewHTMLtd.Attributes.Add("style", style);
        }

        trNode.ReplaceChild(NewHTMLtd, CellNode);
    }
    catch (Exception ex)
    {
        // Best effort: the row is returned as it stands.
        System.Diagnostics.Trace.TraceError("GenerateFinancialCell failed: " + ex);
    }
    return trNode.InnerHtml;
}
// 9) isSubHeadRow
/// <summary>
/// Tells whether a row is an "in between" subhead: a row at or below the first numeric
/// row that contains a colspan and some text, and whose cells from the first numeric
/// column onward hold no value-like content.
/// </summary>
/// <param name="trNode">The row to test.</param>
/// <param name="FirstNumericColumnIndex">Index of the first column holding a financial value.</param>
/// <param name="FirstNumericRowIndex">Index of the first row holding a financial value.</param>
/// <param name="mRowIndex">Index of <paramref name="trNode"/> within its table.</param>
/// <returns><c>true</c> for a subhead row; <c>false</c> otherwise, including when any exception occurs.</returns>
static bool isSubHeadRow(HtmlNode trNode, int FirstNumericColumnIndex, int FirstNumericRowIndex, int mRowIndex)
{
    try
    {
        if (!trNode.InnerHtml.ToLower().Contains("colspan"))
            return false;
        if (mRowIndex < FirstNumericRowIndex)
            return false;
        if (GetCleanText(trNode.InnerText) == "")
            return false;

        bool sawText = false;
        for (int col = FirstNumericColumnIndex; col < trNode.ChildNodes.Count; col++)
        {
            HtmlNode cell = trNode.ChildNodes[col];
            string text = GetCleanText(cell.InnerText);
            if (text == "")
                continue;

            sawText = true;

            bool looksLikeValue = CheckFinancialColumn(text)
                || (text.Length < 10 && !IsStringAlpha(text));
            if (!looksLikeValue)
                continue;

            // A value-like cell is tolerated only when the next cell spans more than 4 columns.
            HtmlNode next = cell.NextSibling;
            bool nextIsWide = next != null
                && next.Attributes["colspan"] != null
                && Convert.ToInt32(next.Attributes["colspan"].Value) > 4;
            if (!nextIsWide)
                return false;
        }

        // All cells in the numeric area blank -> not a subhead.
        return sawText;
    }
    catch
    {
        return false;
    }
}
// 10) IsStringAlpha
/// <summary>
/// Tells whether <paramref name="str"/> consists only of the letters a-z / A-Z and round brackets.
/// </summary>
/// <param name="str">Text to test; must not be <c>null</c>.</param>
/// <returns><c>false</c> for empty or whitespace-only text; otherwise whether the
/// (untrimmed) text matches the pattern.</returns>
private static bool IsStringAlpha(string str)
{
    bool hasVisibleText = str.Trim() != "";
    return hasVisibleText && Regex.IsMatch(str, @"^[a-zA-Z()]*$");
}
// 11) InsertCellForRowSpan
/// <summary>
/// Inserts empty filler cells (marked <c>row-span-cell="true"</c>) into the rows below
/// <paramref name="rowIndex"/> so that a cell with a rowspan can be reduced to rowspan 1.
/// Cells with their own rowspan that are met on the way are expanded recursively first.
/// </summary>
/// <param name="table">Table node; its direct children are treated as rows.</param>
/// <param name="rowIndex">Index of the row that holds the spanning cell.</param>
/// <param name="cellIndex">Column position at which the filler cell is inserted.</param>
/// <param name="rowspan">Rowspan of the spanning cell; one filler is added per covered row.</param>
/// <param name="Maintdnode">The spanning cell; its style and colspan are copied onto each filler.</param>
static void InsertCellForRowSpan(ref HtmlNode table, int rowIndex, int cellIndex, int rowspan, HtmlNode Maintdnode)
{
    const string FillerCellHtml = "<td style=\"white-space:nowrap;padding-right:5px;padding-left:5px;\" row-span-cell=\"true\" ></td>";

    try
    {
        for (int rowNo = 0; rowNo < table.ChildNodes.Count; rowNo++)
        {
            HtmlNode row = table.ChildNodes[rowNo];

            bool needsFiller = rowNo > rowIndex && rowspan - 1 > 0;
            if (!needsFiller)
            {
                if (rowspan - 1 == 0)
                    break; // every covered row has been served
                continue;
            }

            if (row.ChildNodes.Count == 0)
            {
                // Empty row: the filler simply becomes its only cell.
                HtmlNode onlyCell = HtmlNode.CreateNode(FillerCellHtml);
                if (Maintdnode.Attributes["style"] != null)
                    onlyCell.Attributes["style"].Value = Maintdnode.Attributes["style"].Value + ";" + onlyCell.Attributes["style"].Value;
                if (Maintdnode.Attributes["colspan"] != null)
                    onlyCell.Attributes.Add("colspan", Maintdnode.Attributes["colspan"].Value);
                row.AppendChild(onlyCell);

                if (rowspan < 1)
                    return;
                rowspan -= 1;
                continue;
            }

            int lastColumn = GetNormalizedCellCountIndex(row);
            int column = 0; // position of the current cell, counting colspans

            foreach (HtmlNode cell in row.ChildNodes)
            {
                // Expand a nested rowspan before placing our own filler.
                if (cell.Attributes["rowspan"] != null && cell.Attributes["rowspan"].Value != "1")
                {
                    int nestedSpan = Convert.ToInt32(cell.Attributes["rowspan"].Value);
                    InsertCellForRowSpan(ref table, rowNo, column, nestedSpan, cell);
                    cell.Attributes["rowspan"].Value = "1";
                    cell.Attributes.Add("Rowspanremoved", "true");
                    cell.Attributes.Add("OriginalRowspan", nestedSpan.ToString());
                }

                bool rowTooShort = lastColumn < cellIndex;
                if (rowTooShort || column == cellIndex)
                {
                    HtmlNode filler = HtmlNode.CreateNode(FillerCellHtml);
                    if (Maintdnode.Attributes["style"] != null)
                    {
                        filler.Attributes["style"].Value = Maintdnode.Attributes["style"].Value + ";" + filler.Attributes["style"].Value;

                        string merged = filler.Attributes["style"].Value.Trim();
                        bool solidTopAndBottom = merged.Contains("border-top-style:solid")
                            && merged.Contains("border-bottom-style:solid");
                        if (!solidTopAndBottom)
                            filler.Attributes["style"].Value = filler.Attributes["style"].Value.Replace("border-top", "");
                    }
                    if (Maintdnode.Attributes["colspan"] != null)
                        filler.Attributes.Add("colspan", Maintdnode.Attributes["colspan"].Value);

                    if (rowTooShort)
                        row.InsertAfter(filler, row.LastChild);
                    else
                        row.InsertBefore(filler, cell);

                    if (rowspan < 1)
                        return;
                    rowspan -= 1;
                    break; // one filler per row
                }

                column += cell.Attributes["colspan"] != null
                    ? Convert.ToInt16(cell.Attributes["colspan"].Value)
                    : 1;
            }
        }
    }
    catch (Exception)
    {
        // best effort: rows processed so far keep their filler cells
    }
}
// 12) GetNormalizedCellCountIndex
/// <summary>
/// Returns the index of the last column of a row with colspans taken into account:
/// the sum over all cells of their <c>colspan</c> (1 when absent), minus one.
/// </summary>
/// <param name="trnode">The row to measure.</param>
/// <returns>The colspan-aware last column index; -1 for a <c>null</c> or empty row; the plain
/// child count minus one when a colspan value cannot be converted.</returns>
static int GetNormalizedCellCountIndex(HtmlNode trnode)
{
    // The former null check was followed by an unconditional trnode dereference in the
    // fallback return; a missing row is now reported like an empty one.
    if (trnode == null)
        return -1;

    try
    {
        int mCellCountSum = 0;
        foreach (HtmlNode cell in trnode.ChildNodes)
        {
            HtmlAttribute colspan = cell.Attributes["colspan"];
            mCellCountSum += (colspan != null) ? Convert.ToInt32(colspan.Value) : 1;
        }
        return mCellCountSum - 1;
    }
    catch
    {
        // An unreadable colspan: fall back to counting the physical cells.
    }
    return trnode.ChildNodes.Count - 1;
}
// 13) GetCleanText
/// <summary>
/// Strips whitespace literals, carriage returns, line feeds and the "@#...#@" underline
/// marker tokens from <paramref name="Text"/>. The tokens are removed one after another,
/// in the order listed.
/// </summary>
/// <param name="Text">Text to clean; must not be <c>null</c>.</param>
/// <returns>The text with every listed token removed.</returns>
static string GetCleanText(string Text)
{
    string[] tokensToStrip =
    {
        " ", " ", " ",
        "@#double#@", "@#single#@", "@#doubletop#@", "@#singletop#@", "@#u#@",
        " ", " ",
        "\r", "\n"
    };

    string cleaned = Text;
    foreach (string token in tokensToStrip)
    {
        cleaned = cleaned.Replace(token, "");
    }
    return cleaned;
}
// 14) NormalizeTable_Simplified
/// <summary>
/// Stripped-down normalization: removes rowspan and colspan from every cell by inserting
/// empty cells, without any head/stub/subhead marking. The original span values are kept
/// in <c>OriginalRowspan</c> / <c>OriginalColspan</c>.
/// </summary>
/// <param name="table">Table node; its direct children are treated as rows and their children as cells.</param>
/// <returns>The table's inner HTML after processing (as it stands when an exception interrupts the work).</returns>
static string NormalizeTable_Simplified(HtmlNode table)
{
    try
    {
        for (int rowNo = 0; rowNo < table.ChildNodes.Count; rowNo++)
        {
            HtmlNode row = table.ChildNodes[rowNo];
            for (int cellNo = 0; cellNo < row.ChildNodes.Count; cellNo++)
            {
                HtmlNode cell = row.ChildNodes[cellNo];

                // rowspan: fill the rows below, then collapse this cell to one row
                if (cell.Attributes["rowspan"] != null && cell.Attributes["rowspan"].Value != "1")
                {
                    int rowSpan = Convert.ToInt32(cell.Attributes["rowspan"].Value);
                    InsertCellForRowSpan(ref table, rowNo, cellNo, rowSpan, cell);
                    cell.Attributes["rowspan"].Value = "1";
                    cell.Attributes.Add("OriginalRowspan", rowSpan.ToString());
                }

                // colspan: add (colspan - 1) empty cells right after this one
                if (cell.Attributes["colspan"] != null && cell.Attributes["colspan"].Value != "1")
                {
                    int colSpan = Convert.ToInt32(cell.Attributes["colspan"].Value);
                    int missing = colSpan - 1;
                    while (missing > 0)
                    {
                        row.InsertAfter(HtmlNode.CreateNode("<td></td>"), cell);
                        missing--;
                    }
                    cell.Attributes["colspan"].Value = "1";
                    cell.Attributes.Add("OriginalColspan", colSpan.ToString());
                }
            }
        }
    }
    catch (Exception)
    {
        // best effort: return whatever was normalized so far
    }
    return table.InnerHtml;
}