Friday 27 September 2013

C#.Net code to normalize nested table with the help of HtmlAgility Pack


C#.Net Code to Normalize Nested Tables with the Help of HtmlAgilityPack

Description: After calling the ProcessNestedTables() or ProcessIndividualNestedTable() function given in this post, a nested table is split into multiple separate tables without any nesting.

1)   ProcessNestedTables(): Normalizes all nested tables present in an HTML document. Pass it the HtmlDocument object created using HtmlAgilityPack.

2)   ProcessIndividualNestedTable(): Normalizes an individual nested table. Pass it the HTML of the nested table as a string.


   Before Use (Output)

A
The ABC
First Table
Second Table
/////text
Third Table
/////text
/////text
/////text

   After Use (Output)

A
The ABC
First Table
/////text
/////text

Second Table
/////text

Third Table
/////text


      Methods to Use:

1)  For Complete Html Document:

/// <summary>
/// Visits every <c>table</c> element of <paramref name="objHTMLdoc"/> that still has a parent and asks
/// <c>GenerateTablesFromNestedTables</c> to collect its nested tables into a separate <c>div</c>.
/// If that div ends up containing at least one table, it is inserted directly after the processed table.
/// </summary>
/// <param name="objHTMLdoc">Document to process. It is modified in place and never reassigned.</param>
/// <returns>The document instance that was passed in.</returns>
/// <remarks>
/// Processing is best-effort: an exception stops the walk, is reported through
/// <c>System.Diagnostics.Trace</c>, and the (possibly partially processed) document is still returned.
/// </remarks>
static HtmlAgilityPack.HtmlDocument ProcessNestedTables(ref HtmlAgilityPack.HtmlDocument objHTMLdoc)
{
    try
    {
        HtmlAgilityPack.HtmlNodeCollection tableList = objHTMLdoc.DocumentNode.SelectNodes("//table");
        if (tableList == null)
        {
            // Nothing matched "//table": there is nothing to normalize.
            return objHTMLdoc;
        }

        foreach (HtmlAgilityPack.HtmlNode table in tableList)
        {
            HtmlAgilityPack.HtmlNode parentNode = table.ParentNode;
            if (parentNode == null)
            {
                // Without a parent there is no place to insert the extracted tables.
                continue;
            }

            // Numbering of the "T-n" markers restarts at 1 for every table visited here.
            int nestedTableCount = 1;

            // Container for the extracted tables; it starts with a single <br> that the
            // helper keeps as the last child by inserting extracted tables before it.
            HtmlAgilityPack.HtmlNode extractedTables = HtmlAgilityPack.HtmlNode.CreateNode("<div></div>");
            extractedTables.AppendChild(HtmlAgilityPack.HtmlNode.CreateNode("<br>"));

            GenerateTablesFromNestedTables(table, ref extractedTables, ref nestedTableCount);

            // Only attach the container when at least one nested table was actually extracted.
            if (extractedTables.InnerHtml.Contains("<table"))
            {
                parentNode.InsertAfter(extractedTables, table);
            }
        }
    }
    catch (Exception ex)
    {
        // Keep the original best-effort contract (no exception escapes), but leave a trace
        // instead of discarding the failure silently.
        System.Diagnostics.Trace.TraceError("ProcessNestedTables failed: " + ex);
    }
    return objHTMLdoc;//OutPut
}

2)       For Individual Html Table:

/// <summary>
/// Normalizes the nested tables of a single table supplied as an HTML string.
/// The input is URL-decoded, stripped of double spaces, tabs and line breaks, loaded into a new
/// document and passed to <c>ProcessNestedTables</c>.
/// </summary>
/// <param name="tableHtml">HTML of the table to process.</param>
/// <returns>
/// The outer HTML of the processed document. If processing fails, the outer HTML of the document
/// in whatever state it reached (a document into which nothing was loaded if the failure
/// happened before loading).
/// </returns>
public string ProcessIndividualNestedTable(string tableHtml)
{
    HtmlAgilityPack.HtmlDocument objHTMLdoc = new HtmlAgilityPack.HtmlDocument();
    try
    {
        // The published snippet lost the commas inside every Replace call and assigned the
        // result to an undeclared variable that was never loaded. The cleaned string is now a
        // local and is the one that gets loaded, mirroring what FormatAllTable does.
        string cleanedHtml = HttpUtility.UrlDecode(tableHtml)
            .Replace("  ", "")
            .Replace("\t", "")
            .Replace("\n", "")
            .Replace("\r", "");

        objHTMLdoc.LoadHtml(cleanedHtml);
        objHTMLdoc = ProcessNestedTables(ref objHTMLdoc);
    }
    catch (Exception ex)
    {
        // Best-effort: report the failure and fall through to return whatever was built.
        System.Diagnostics.Trace.TraceError("ProcessIndividualNestedTable failed: " + ex);
    }
    return objHTMLdoc.DocumentNode.OuterHtml; //Output
}


Supporting methods used inside for processing:

// 1) GenerateTablesFromNestedTables
        /// <summary>
        /// Moves every table nested inside the cells of <paramref name="table"/> into
        /// <paramref name="HTMLNode"/> and leaves a <c>&lt;span&gt;T-n&lt;/span&gt;</c> marker where it used to be.
        /// </summary>
        /// <param name="table">Table whose children are treated as rows and grandchildren as cells.</param>
        /// <param name="HTMLNode">
        /// Container that receives one <c>&lt;div&gt;&lt;span&gt;T-n&lt;/span&gt;...table...&lt;/div&gt;</c> per extracted table.
        /// Each div is inserted before the container's last child, or appended when the container is empty.
        /// </param>
        /// <param name="NestedLevel">Running number used in the "T-n" labels; incremented once per extracted table.</param>
        /// <remarks>
        /// Nested tables are processed recursively before they are copied, so inner tables are numbered first.
        /// Exceptions propagate to the caller unchanged (the former <c>throw ex;</c> reset their stack trace).
        /// </remarks>
        static void GenerateTablesFromNestedTables(HtmlNode table, ref HtmlNode HTMLNode, ref int NestedLevel)
        {
            // A table whose first child is itself a table is unwrapped: the outer table takes
            // over the inner table's content.
            if (table.ChildNodes != null && table.ChildNodes.Count > 0)
            {
                HtmlNode invalidInnerTable = table.ChildNodes[0];
                if (invalidInnerTable.Name == "table")
                {
                    table.InnerHtml = invalidInnerTable.InnerHtml;
                }
            }

            for (int trchild = 0; trchild < table.ChildNodes.Count; trchild++)
            {
                HtmlNode trElement = table.ChildNodes[trchild];
                for (int tdchild = 0; tdchild < trElement.ChildNodes.Count; tdchild++)
                {
                    HtmlNode tdElement = trElement.ChildNodes[tdchild];
                    HtmlNode ChildTableNode = GetTableObject(ref tdElement);
                    if (ChildTableNode == null)
                    {
                        continue;
                    }

                    // Flatten the nested table first so that the copy taken below is already normalized.
                    GenerateTablesFromNestedTables(ChildTableNode, ref HTMLNode, ref NestedLevel);

                    // Labelled copy of the nested table, followed by a line break.
                    HtmlNode FormattedChildTableNode = HtmlNode.CreateNode("<div><span>T-" + NestedLevel.ToString() + "</span>" + ChildTableNode.OuterHtml + "</div>");
                    FormattedChildTableNode.AppendChild(HtmlNode.CreateNode("<br>"));

                    if (HTMLNode.ChildNodes.Count > 0)
                    {
                        HTMLNode.InsertBefore(FormattedChildTableNode, HTMLNode.LastChild);
                    }
                    else
                    {
                        HTMLNode.AppendChild(FormattedChildTableNode);
                    }

                    // Marker that replaces the nested table; it carries the same label as the copy.
                    HtmlNode htmlNestedMapping = HtmlNode.CreateNode("<span>T-" + NestedLevel.ToString() + "</span>");
                    NestedLevel += 1;

                    HtmlNode ParentChildTableNode = ChildTableNode.ParentNode;
                    if (ParentChildTableNode != null)
                    {
                        ParentChildTableNode.ReplaceChild(htmlNestedMapping, ChildTableNode);
                    }
                }
            }
        }

// 2) GetTableObject
        /// <summary>
        /// Finds the first <c>table</c> node below <paramref name="tdNode"/>, searching depth-first in child order.
        /// </summary>
        /// <param name="tdNode">Node to search. It is only read; the reference is never reassigned.</param>
        /// <returns>
        /// The first descendant named "table"; <paramref name="tdNode"/> itself when it has no children and is
        /// named "table"; otherwise <c>null</c>.
        /// </returns>
        /// <remarks>
        /// A child is only descended into when its markup contains "&lt;table". If that descent finds nothing,
        /// the remaining siblings are still examined (previously the search stopped and returned <c>null</c>).
        /// Exceptions propagate unchanged; the former <c>throw ex;</c> reset their stack trace.
        /// </remarks>
        static HtmlNode GetTableObject(ref HtmlNode tdNode)
        {
            if (tdNode.ChildNodes.Count == 0)
            {
                // Leaf node: it can only be a match itself.
                return tdNode.Name == "table" ? tdNode : null;
            }

            for (int child = 0; child < tdNode.ChildNodes.Count; child++)
            {
                HtmlNode candidate = tdNode.ChildNodes[child];
                if (candidate.Name == "table")
                {
                    return candidate;
                }

                if (candidate.InnerHtml.Contains("<table"))
                {
                    HtmlNode nested = GetTableObject(ref candidate);
                    if (nested != null)
                    {
                        return nested;
                    }
                    // The text matched but no table node was found below this child; keep looking.
                }
            }
            return null;
        }





How to Normalize (remove rowspan & colspan) a Financial HTML Table



How to Normalize (remove rowspan & colspan) a Financial HTML Table:

Description: Converts an HTML financial table to its normalized format, marking the header, stub and financial cells along the way.

Header: Header of the financial table (marked with head="true").
Stub: First non-financial column of the financial table (marked with stub="true").
Subhead: All cells that are above FirstNumericRowIndex (marked with subhead="true"),
 or, in some cases, cells below FirstNumericRowIndex that occupy a complete row.
FirstNumericRowIndex: Index of the first row that has a financial value.
FirstNumericColumnIndex: Index of the first column that has a financial value.

Tools Used: Html Agility pack

Diagrammatic Representation:

a)    Before:
b)     
Side
Company Abc
2012
2013
Assets
$5,000
6000
ABC Assets
USD300(1)
$4,000x

In Between SubHead
Debts
2000
$10000

b) After Normalization:

Side
Company Abc
2012
2013
Assets
$5,000
6000
ABC Assets
USD300(1)
$4,000x

In Between SubHead

Debts
2000
$10000

Main Calling method:
1)    FormatAllTable(): Starting point for normalization. Takes the HTML table string as input.
2)    NormalizeTable(): Performs the normalization.
3)    InsertCellForRowSpan(): Creates the extra cells needed when a rowspan is removed.

Supporting methods:
1)    GetCleanText(): Returns the cleaned text content.
2)    CheckFinancialColumn(): Checks whether a cell belongs to a financial column.
3)    GenerateFinancialCellSuperSubScript(): Identifies superscript/subscript footnotes in a financial cell and marks them. (Comment out the call to this method if you have no use for it in your code.)
4)    GenerateFinancialCell(): Cleans the inner HTML of a financial cell and adds styles to the cell.
5)    SetFirstRowAndColIndex(): Identifies the first numeric row and column index of a table.
6)    GetNormalizedCellCountIndex(): Gets the proper index of a cell, excluding colspan.
7)    isSubHeadRow(): Identifies whether a table has an in-between subhead in addition to the header subhead.
8)    IsStringAlpha(): Identifies alphabetic strings.
9)    VerifyCurrency(): Checks whether the given column is a currency column.
10) CheckNumeric(): Identifies numeric columns.



Way of Calling:
1)        string strTable = "table html content";
2)        Call the FormatAllTable method as shown below.
        E.g. strTable = FormatAllTable(strTable);


Method Definitions:

// 1) FormatAllTable
        /// <summary>
        /// Entry point for normalization: strips double spaces, tabs and line breaks from
        /// <paramref name="tableHtml"/>, loads it, and replaces the inner HTML of every <c>table</c> found with the
        /// result of <c>NormalizeTable</c>.
        /// </summary>
        /// <param name="tableHtml">HTML containing the table(s) to normalize.</param>
        /// <returns>The outer HTML of the processed document.</returns>
        /// <remarks>
        /// Processing is best-effort. A failure in one table is traced and the remaining tables are still
        /// processed; a failure outside the per-table loop is traced and whatever was built is returned.
        /// </remarks>
        public string FormatAllTable(string tableHtml)
        {
            HtmlAgilityPack.HtmlDocument objHTMLdoc = new HtmlAgilityPack.HtmlDocument();
            try
            {
                tableHtml = tableHtml.Replace("  ", "").Replace("\t", "").Replace("\n", "").Replace("\r", "");
                objHTMLdoc.LoadHtml(tableHtml);

                HtmlNodeCollection tableList = objHTMLdoc.DocumentNode.SelectNodes("//table");
                if (tableList != null)
                {
                    for (int tableIndex = 0; tableIndex < tableList.Count; tableIndex++)
                    {
                        try
                        {
                            HtmlNode tableCopy = tableList[tableIndex];

                            // Fresh indices for every table. They used to be shared across iterations,
                            // and SetFirstRowAndColIndex only resets the row index, so the column
                            // index found for one table leaked into the next one.
                            int FirstNumericRowIndex = -1, FirstNumericColumnIndex = -1;
                            bool hasSubHead = false;

                            SetFirstRowAndColIndex(tableCopy, ref FirstNumericRowIndex, ref FirstNumericColumnIndex);
                            tableCopy.InnerHtml = NormalizeTable(tableCopy, FirstNumericRowIndex, FirstNumericColumnIndex, ref hasSubHead);
                        }
                        catch (Exception ex)
                        {
                            // One bad table must not prevent the others from being normalized.
                            System.Diagnostics.Trace.TraceError("FormatAllTable: table " + tableIndex + " failed: " + ex);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                System.Diagnostics.Trace.TraceError("FormatAllTable failed: " + ex);
            }
            return objHTMLdoc.DocumentNode.OuterHtml; //OutPut
        }

2)     static string NormalizeTable(HtmlNode table, int FirstNumericRowIndex, int FirstNumericColumnIndex, ref bool isSubHeadFound)
        {
            // Purpose: walks the children of `table` (rows) and their children (cells), expands rowspan/colspan
            // into individual cells, tags cells with head / stub / subhead attributes and rewrites each row
            // through GenerateFinancialCell.
            //
            // Parameters:
            //   table                   - table node; it is modified in place.
            //   FirstNumericRowIndex    - row index compared against trindex to separate header rows from data rows.
            //   FirstNumericColumnIndex - column index compared against tdindex to separate stub columns from value columns.
            //   isSubHeadFound          - set to true when isSubHeadRow reports a sub-head row; never reset to false here.
            //
            // Returns: table.InnerHtml after processing. Any exception is swallowed by the catch below, so the
            // returned markup may be only partially normalized.
            try
            {
                // NOTE(review): colCnt is not read anywhere else in this method. table.FirstChild is dereferenced
                // without a null check; a failure here would land in the catch below.
                int colCnt = table.FirstChild.ChildNodes.Count;
                //int isHeadInProcessRow = 7;
                HtmlNode trnode = null;
                HtmlNode tdnode = null;
                bool isStubDone = false;
                string content = string.Empty;
                //string TempContent = string.Empty;
                //string TempSymbol = "";

                int trindex = 0, tdindex = 0, mRowSpan = 0, m = 0, mColSpan = 0;
                for (trindex = 0; trindex < table.ChildNodes.Count; trindex++)
                {
                    trnode = table.ChildNodes[trindex];
                    // Flag the row as a sub-head row once; rows that already carry the attribute are left alone.
                    if (trnode.Attributes["subheadrow"] == null)
                    {
                        if (isSubHeadRow(trnode, FirstNumericColumnIndex, FirstNumericRowIndex, trindex))
                        {
                            trnode.Attributes.Add("subheadrow", "true");
                            isSubHeadFound = true;
                        }
                    }

                    // At most one stub cell is assigned per row.
                    isStubDone = false;
                    // The bound is re-evaluated every iteration, so cells inserted for colspan below are visited too.
                    for (tdindex = 0; tdindex < trnode.ChildNodes.Count; tdindex++)
                    {
                        tdnode = trnode.ChildNodes[tdindex];
                        content = GetCleanText(tdnode.InnerText);//.Replace("&#160;", "").Replace("&nbsp;", "").Replace("&#xa0;", "").Replace(" ", "");
                        mRowSpan = 0; mColSpan = 0;

                        // Rowspan removal: InsertCellForRowSpan is handed the table and the span, then the cell is
                        // reset to rowspan 1 and the original value is kept in "OriginalRowspan".
                        if (tdnode.Attributes["rowspan"] != null)
                        {
                            if (tdnode.Attributes["rowspan"].Value != "1")
                            {
                                // Convert.ToInt32 throws on a non-numeric value; that ends up in the catch below.
                                mRowSpan = Convert.ToInt32(tdnode.Attributes["rowspan"].Value);
                                InsertCellForRowSpan(ref table, trindex, tdindex, mRowSpan, tdnode);
                                tdnode.Attributes["rowspan"].Value = "1";
                                tdnode.Attributes.Add("Rowspanremoved", "true");
                                tdnode.Attributes.Add("OriginalRowspan", "" + mRowSpan + "");
                            }
                        }
                        // Colspan removal: insert (colspan - 1) empty cells right after this cell, each inheriting
                        // the cell's style in front of the fixed padding/nowrap style.
                        if (tdnode.Attributes["colspan"] != null && tdnode.Attributes["colspan"].Value != "1")
                        {
                            mColSpan = Convert.ToInt32(tdnode.Attributes["colspan"].Value);
                            for (m = 0; m < mColSpan - 1; m++)
                            {
                                HtmlNode newNode = HtmlNode.CreateNode("<td style=\"white-space:nowrap;padding-right:5px;padding-left:5px;\" data-append='true'></td>");
                                if (tdnode.Attributes["style"] != null)
                                    newNode.Attributes["style"].Value = tdnode.Attributes["style"].Value + ";" + newNode.Attributes["style"].Value;

                                trnode.InsertAfter(newNode, tdnode);
                            }
                            tdnode.Attributes["colspan"].Value = "1";
                            tdnode.Attributes.Add("Colspanremoved", "true");
                            tdnode.Attributes.Add("OriginalColspan", "" + mColSpan + "");
                        }
                        // Stub / head detection for cells left of the first numeric column.
                        if (content != "" || tdindex == FirstNumericColumnIndex - 1)
                        {
                            if (tdindex < FirstNumericColumnIndex) //tdindex < 3 && table.Attributes["excel"] == null
                            {
                                if (!CheckFinancialColumn(content) && !isStubDone)//&& content.Length > 0 (For bug)
                                {
                                    // NOTE(review): `1 / 2` is integer division and evaluates to 0, so this compares
                                    // mColSpan with trnode.ChildNodes.Count as it stands after the colspan cells were
                                    // inserted above. Possibly `(Count + 1) / 2` was intended - confirm before changing.
                                    if (mColSpan > trnode.ChildNodes.Count + 1 / 2 && trindex < FirstNumericRowIndex)
                                        tdnode.Attributes.Add("head", "true");
                                    else
                                    {
                                        tdnode.Attributes.Add("stub", "true");
                                        // In the one or two rows just above the first numeric row, a stub whose text is
                                        // the whole row's text marks the row with "BlankStub".
                                        if ((trindex == FirstNumericRowIndex - 1 || trindex == FirstNumericRowIndex - 2) && content == GetCleanText(trnode.InnerText))
                                        {
                                            if (trnode.Attributes["BlankStub"] == null)
                                                trnode.Attributes.Add("BlankStub", "true");
                                        }
                                    }
                                    isStubDone = true;
                                }
                            }
                        }

                        if (tdnode.ChildNodes.Count > 0)
                        {
                            // Cells above the first numeric row and at/after the first numeric column are headers.
                            if (trindex < FirstNumericRowIndex && tdindex >= FirstNumericColumnIndex)
                            {
                                tdnode.Attributes.Add("head", "true");
                            }
                            else
                            {
                                // After the stub has been assigned in this row: non-financial text longer than four
                                // characters, or any cell above the first numeric row, is tagged as a sub-head.
                                if (!CheckFinancialColumn(content) && content.Length > 4 && isStubDone)
                                {
                                    tdnode.Attributes.Add("subhead", "true");
                                }
                                else if (trindex < FirstNumericRowIndex && isStubDone)
                                {
                                    tdnode.Attributes.Add("subhead", "true");
                                }
                            }
                            if (trindex > 0 && tdindex > 0 && content.Length < 15 && content != "")
                            {
                                // The `content == ""` alternative cannot be true here because the enclosing
                                // condition already requires content != "".
                                if (CheckFinancialColumn(content) || content == "")
                                {
                                    tdnode.InnerHtml = GenerateFinancialCellSuperSubScript(tdnode);
                                }
                            }
                            // The whole row's markup is replaced by GenerateFinancialCell's return value after every
                            // cell that has children. NOTE(review): GenerateFinancialCell is not fully visible here;
                            // presumably it returns the row markup with this cell rewritten - verify.
                            trnode.InnerHtml = GenerateFinancialCell(tdnode, trnode);
                        }
                    }
                }

            }
            catch (Exception ex)
            {
                // Exceptions are swallowed: processing stops at the failing cell and the table is returned as-is.
            }
            return table.InnerHtml;
        }



// 3) SetFirstRowAndColIndex
        /// <summary>
        /// Determines the first column and the first row of <paramref name="table"/> that hold a financial value
        /// according to <c>CheckFinancialColumn</c>.
        /// </summary>
        /// <param name="table">Table node whose children are treated as rows and grandchildren as cells.</param>
        /// <param name="FirstNumericRowIndex">Receives the first numeric row index, or -1 when none is found.</param>
        /// <param name="FirstNumericColumnIndex">
        /// Receives the first numeric column index, or -1 when none is found. Column 0 is never considered.
        /// </param>
        /// <remarks>
        /// Both outputs are reset to -1 on entry. Previously only the row index was reset, so a column index left
        /// over from an earlier table ended the column scan early. Exceptions raised while scanning are traced and
        /// the indices keep the values reached so far.
        /// </remarks>
        static void SetFirstRowAndColIndex(HtmlNode table, ref int FirstNumericRowIndex, ref int FirstNumericColumnIndex)
        {
            int i = 0, row = 0;
            int columnCount = GetHighestColumnCount(table);
            FirstNumericRowIndex = -1;
            FirstNumericColumnIndex = -1;
            string Content = string.Empty;
            try
            {
                // Column scan: columns are tried left to right starting at 1, rows top to bottom.
                for (i = 1; i < columnCount; i++)
                {
                    for (row = 0; row < table.ChildNodes.Count; row++)
                    {
                        if (i < table.ChildNodes[row].ChildNodes.Count)
                        {
                            Content = GetCleanText(table.ChildNodes[row].ChildNodes[i].InnerText);
                            if (CheckFinancialColumn(Content))
                            {
                                if (table.ChildNodes[row].ChildNodes[i].NextSibling != null)
                                {
                                    // A financial-looking cell is rejected (next row is tried) when its right-hand
                                    // neighbour is one character or more than six characters long and is neither
                                    // numeric nor accepted by VerifyGutterSymbol.
                                    Content = GetCleanText(table.ChildNodes[row].ChildNodes[i].NextSibling.InnerText);
                                    if (Content.Length > 6 || Content.Length == 1)
                                    {
                                        if (!CheckNumeric(Content) && !VerifyGutterSymbol(Content, "=="))
                                            continue;
                                    }
                                }
                                FirstNumericColumnIndex = i;
                                break;
                            }
                        }
                    }
                    if (FirstNumericColumnIndex > 0)
                        break;
                }

                // No numeric column: the row index stays -1. This used to be reached only because the row scan
                // below indexed cell -1 and the resulting exception was swallowed.
                if (FirstNumericColumnIndex < 0)
                    return;

                // Row scan: first row with a financial value at or to the right of the numeric column.
                for (row = 0; row < table.ChildNodes.Count; row++)
                {
                    for (i = FirstNumericColumnIndex; i < columnCount; i++)
                    {
                        if (i < table.ChildNodes[row].ChildNodes.Count)
                        {
                            if (CheckFinancialColumn(table.ChildNodes[row].ChildNodes[i].InnerText))
                            {
                                FirstNumericRowIndex = row;
                                break;
                            }
                        }
                    }
                    if (FirstNumericRowIndex > -1)
                        break;
                }
            }
            catch (Exception ex)
            {
                // Best-effort: keep whatever indices were found, but do not hide the failure completely.
                System.Diagnostics.Trace.TraceError("SetFirstRowAndColIndex failed: " + ex);
            }
        }


// 4) CheckFinancialColumn
        /// <summary>
        /// Decides whether the text of a cell looks like a financial value.
        /// </summary>
        /// <param name="CellContent">Raw cell text; it is passed through <c>GetCleanText</c> first.</param>
        /// <returns>
        /// For text that <c>VerifyCurrency(..., "Starts")</c> accepts: <c>true</c> when the lower-cased, substituted
        /// text has at most four characters or its characters 1-3 are numeric, otherwise the result of
        /// <c>CheckNumeric</c> on that text. For other text: <c>false</c> for years (<c>isYear</c>) and for a single
        /// character wrapped in brackets, otherwise the result of <c>CheckNumeric</c>. <c>false</c> on any exception.
        /// </returns>
        static bool CheckFinancialColumn(string CellContent)
        {
            try
            {
                string text = GetCleanText(CellContent);

                if (VerifyCurrency(text, "Starts"))
                {
                    text = text.ToLower().Replace("chf", "$").Replace("rs", "$").Replace("rs", "$").Replace("&#151;", "—").Replace("&#8212;", "—");

                    // Short currency-prefixed text is accepted outright.
                    if (text.Length <= 4)
                    {
                        return true;
                    }
                    // Longer text: accept when the three characters after the first one are numeric...
                    if (CheckNumeric(text.Substring(1, 3)))
                    {
                        return true;
                    }
                    // ...otherwise let the whole text decide.
                    return CheckNumeric(text);
                }

                if (isYear(text))
                {
                    return false;
                }

                // A lone character inside brackets, e.g. "(a)" or "[1]", is not a financial value.
                string withoutBrackets = text.Replace("(", "").Replace(")", "").Replace("[", "").Replace("]", "").Replace("{", "").Replace("}", "").Trim();
                bool hasOpeningBracket = text.Contains("(") || text.Contains("[") || text.Contains("{");
                if (withoutBrackets.Length == 1 && hasOpeningBracket)
                {
                    return false;
                }

                return CheckNumeric(text);
            }
            catch
            {
                return false;
            }
        }



// 5) VerifyCurrency
        /// <summary>
        /// Checks whether a cell is, or starts with, a currency symbol. Every entry of <c>CurrencySymbols</c> is
        /// first replaced by "$" (unless the text is empty or already exactly "$").
        /// </summary>
        /// <param name="CellContent">Cell text to inspect.</param>
        /// <param name="sType">
        /// "==" requires the substituted text to be exactly "$"; any other value requires it to start with "$".
        /// </param>
        /// <returns>
        /// With "==": whether the substituted text equals "$". Otherwise, for text starting with "$": <c>true</c>
        /// when the text stripped of brackets, commas, percent signs, the sub/super markers and "XX" is shorter than
        /// 12 characters, else the result of <c>CheckNumeric</c> on the stripped text without "$".
        /// <c>false</c> in every other case, including any exception.
        /// </returns>
        static bool VerifyCurrency(string CellContent, string sType)
        {
            try
            {
                string normalized = CellContent;
                if (normalized != "" && normalized != "$")
                {
                    for (int symbol = 0; symbol < CurrencySymbols.Length; symbol++)
                    {
                        normalized = normalized.Replace(CurrencySymbols[symbol], "$");
                    }
                }

                string stripped = normalized.Replace("(", "").Replace(")", "").Replace(",", "").Replace("[", "").Replace("]", "").Replace("%", "").Replace("#@super#@", "").Replace("#@sub#@", "");

                // Exact-match mode: the cell must be nothing but the currency symbol.
                if (sType == "==")
                {
                    return normalized == "$";
                }

                // Prefix mode.
                if (!normalized.StartsWith("$"))
                {
                    return false;
                }
                if (stripped.Replace("XX", "").Length < 12)
                {
                    return true;
                }
                return CheckNumeric(stripped.Replace("$", ""));
            }
            catch
            {
                return false;
            }
        }


// 6) CheckNumeric
        /// <summary>
        /// Decides whether a cell's text counts as numeric once formatting characters are removed.
        /// </summary>
        /// <param name="CellContent">Raw cell text; it is passed through <c>GetCleanText</c> first.</param>
        /// <returns>
        /// <c>true</c> for the placeholders "*", "-", "--", "—", "&amp;#151;", "&amp;#8212;", "n/a", "nm" and anything
        /// starting with "xx" (the last three case-insensitively), and for text that consists only of the digits
        /// 0-9 after brackets, percent signs, dots, commas, dashes, "+", "#", "@", "year(s)" and a leading currency
        /// symbol are removed. <c>false</c> for empty text, text of 60 or more characters, and on any exception.
        /// </returns>
        static bool CheckNumeric(string CellContent)
        {
            try
            {
                string text = GetCleanText(CellContent);
                text = text.Replace("(", "").Replace(")", "").Replace("––", "–").Replace("%", "").Replace("--", "-").Replace(".", "").Replace("[", "").Replace("]", "").Replace(".", "").Replace("**", "*").Replace(",", "").Replace("years", "").Replace("year", "");
                text = text.Trim();

                // Placeholders that stand for "no value" are treated as numeric.
                string lowered = text.ToLower();
                bool isPlaceholder =
                    text == "*" || text == "-" || text == "--" || text == "—" ||
                    text == "&#151;" || text == "&#8212;" ||
                    lowered.StartsWith("xx") || lowered == "n/a" || lowered == "nm";
                if (isPlaceholder)
                {
                    return true;
                }

                text = text.Replace("–", "").Replace("-", "").Replace("+", "").Replace("#", "").Replace("@", "");

                // Drop a leading currency symbol from short values.
                if (text.Length > 1 && text.Length < 16 && VerifyCurrency(text, "Starts"))
                {
                    text = text.Substring(1);
                }

                // The length is checked again because the step above may have shortened the text.
                if (text.Length > 1 && text.Length < 16)
                {
                    int openParen = text.IndexOf("(");
                    if (openParen > 2)
                    {
                        text = text.Substring(0, openParen);
                    }
                }

                if (text.Length == 0 || text.Length >= 60)
                {
                    return false;
                }
                return Regex.IsMatch(text, @"^[0-9]*$", RegexOptions.None);
            }
            catch
            {
                return false;
            }
        }


     7)   static string GenerateFinancialCellSuperSubScript(HtmlNode CellNode)
        {
            // Purpose: inside a financial cell, prefixes subscript text with "#@sub#@" and superscript text with
            // "#@super#@", and records the font size of the sub/superscript in a "SuperSubFontSize" attribute
            // on the cell.
            //
            // Only cells that have exactly one child, whose child is a <p>, and that carry neither a "head" nor a
            // "stub" attribute are touched.
            //
            // Returns: CellNode.InnerHtml after the (possible) modifications. Exceptions are swallowed by the
            // catch at the bottom, in which case the markup as modified so far is returned.
            try
            {
                string styleList = null;
                string[] styleLists = null;
                string fontSize = string.Empty;
                int i = 0, j = 0;
                if ((CellNode.ChildNodes.Count > 0 && CellNode.ChildNodes.Count < 2) && CellNode.Attributes["head"] == null && CellNode.Attributes["stub"] == null)
                {
                    if (CellNode.ChildNodes[0].Name == "p")
                    {
                        if (CellNode.ChildNodes[0].ChildNodes.Count > 0)
                        {
                            // Walk the children of the paragraph.
                            for (i = 0; i < CellNode.ChildNodes[0].ChildNodes.Count; i++)
                            {
                                fontSize = string.Empty;
                                // Step 1: if this child's markup mentions a sub/super vertical-align, copy its
                                // font-size declaration to the cell (first one found wins; an existing attribute is kept).
                                if (CellNode.ChildNodes[0].ChildNodes[i].OuterHtml.Contains("vertical-align:sub") || CellNode.ChildNodes[0].ChildNodes[i].OuterHtml.Contains("vertical-align:super"))
                                {
                                    if (CellNode.ChildNodes[0].ChildNodes[i].Attributes["style"] != null)
                                    {
                                        styleList = CellNode.ChildNodes[0].ChildNodes[i].Attributes["style"].Value;
                                        styleLists = styleList.Split(';');
                                        for (j = 0; j < styleLists.Length; j++)
                                        {
                                            if (styleLists[j].Trim().StartsWith("font-size"))
                                            {
                                                fontSize = styleLists[j].Trim().Replace("font-size", "").Replace(":", "");
                                                if (CellNode.Attributes["SuperSubFontSize"] == null && fontSize != "")
                                                    CellNode.Attributes.Add("SuperSubFontSize", fontSize);
                                                // Leaves only the style-declaration loop (j).
                                                break;
                                            }
                                        }
                                    }
                                }
                                // Step 2: subscript marker.
                                if (CellNode.ChildNodes[0].ChildNodes[i].OuterHtml.Contains("vertical-align:sub"))
                                {
                                    if (CellNode.ChildNodes[0].ChildNodes[i].ChildNodes.Count > 1)
                                    {
                                        // Several grandchildren: mark the first one whose markup is a subscript.
                                        for (int s = 0; s < CellNode.ChildNodes[0].ChildNodes[i].ChildNodes.Count; s++)
                                        {
                                            if (CellNode.ChildNodes[0].ChildNodes[i].ChildNodes[s].OuterHtml.Contains("vertical-align:sub"))
                                            {
                                                // NOTE(review): String.Replace throws ArgumentException when the text to
                                                // replace (InnerText here) is empty; that would be swallowed by the catch
                                                // below - confirm this is acceptable.
                                                CellNode.ChildNodes[0].ChildNodes[i].ChildNodes[s].InnerHtml = CellNode.ChildNodes[0].ChildNodes[i].ChildNodes[s].InnerHtml.Replace(CellNode.ChildNodes[0].ChildNodes[i].ChildNodes[s].InnerText, "#@sub#@" + CellNode.ChildNodes[0].ChildNodes[i].ChildNodes[s].InnerText);
                                                // Leaves only the grandchild loop (s); the superscript check below still runs.
                                                break;
                                            }
                                        }
                                    }
                                    else
                                    {
                                        if (GetCleanText(CellNode.ChildNodes[0].ChildNodes[i].InnerText) != "")
                                        {  
                                            if (CellNode.ChildNodes[0].ChildNodes[i].OuterHtml.Contains("vertical-align:sub"))
                                            {                                        
                                                // The child's markup is replaced by marker + plain text, which drops any
                                                // inner tags of that child.
                                                CellNode.ChildNodes[0].ChildNodes[i].InnerHtml = "#@sub#@" + CellNode.ChildNodes[0].ChildNodes[i].InnerText;
                                                // Leaves the paragraph-children loop (i): no further children are examined.
                                                break;
                                            }
                                        }
                                    }
                                }
                                // Step 3: superscript marker. OuterHtml is read again here, i.e. after any change made in step 2.
                                if (CellNode.ChildNodes[0].ChildNodes[i].OuterHtml.Contains("vertical-align:super"))
                                {

                                    if (CellNode.ChildNodes[0].ChildNodes[i].ChildNodes.Count > 1)
                                    {
                                        // Several grandchildren: mark the first one whose markup is a superscript.
                                        for (int s = 0; s < CellNode.ChildNodes[0].ChildNodes[i].ChildNodes.Count; s++)
                                        {
                                            if (CellNode.ChildNodes[0].ChildNodes[i].ChildNodes[s].OuterHtml.Contains("vertical-align:super"))
                                            {  
                                                CellNode.ChildNodes[0].ChildNodes[i].ChildNodes[s].InnerHtml = CellNode.ChildNodes[0].ChildNodes[i].ChildNodes[s].InnerHtml.Replace(CellNode.ChildNodes[0].ChildNodes[i].ChildNodes[s].InnerText, "#@super#@" + CellNode.ChildNodes[0].ChildNodes[i].ChildNodes[s].InnerText);
                                                // Leaves only the grandchild loop (s).
                                                break;                                            
                                            }
                                        }
                                    }
                                    else
                                    {
                                        if (GetCleanText(CellNode.ChildNodes[0].ChildNodes[i].InnerText) != "")
                                        {
                                            // Unlike the subscript case, the marker is spliced into the existing markup
                                            // (inner tags are kept where the text matches).
                                            CellNode.ChildNodes[0].ChildNodes[i].InnerHtml = CellNode.ChildNodes[0].ChildNodes[i].InnerHtml.Replace(CellNode.ChildNodes[0].ChildNodes[i].InnerText, "#@super#@" + CellNode.ChildNodes[0].ChildNodes[i].InnerText);                                          
                                            // Leaves the paragraph-children loop (i).
                                            break;
                                        }

                                    }
                                }
                            }
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Swallowed on purpose or by omission - the caller always receives the cell's current markup.
            }
            return CellNode.InnerHtml;
        }




    8)    static string GenerateFinancialCell(HtmlNode CellNode, HtmlNode trNode)
        {
            try
            {
                HtmlNode NewHTMLtd = null;
                string style = string.Empty;
                string Content = string.Empty;
                string mAppendType = string.Empty;
                HtmlNode ParaNode = null;
                HtmlNode spanNode = null;
                string[] spanStyleSub = null;
                int i = 0;
                int mChild = 0;
                mAppendType = (CellNode.Attributes["append-type"] != null) ? CellNode.Attributes["append-type"].Value : "content";

                if (CellNode.Attributes["head"] != null || CellNode.Attributes["stub"] != null || CellNode.Attributes["subhead"] != null)
                    NewHTMLtd = HtmlNode.CreateNode("<td  data-append=\"true\" append-type='" + mAppendType + "'>" + CellNode.InnerHtml + "</td>");
                else
                    NewHTMLtd = HtmlNode.CreateNode("<td  data-append=\"true\" append-type='" + mAppendType + "'>" + CellNode.InnerText + "</td>");

                foreach (HtmlAttribute atr in CellNode.Attributes)
                {
                    var found = NewHTMLtd.Attributes.Where(x => x.Name.ToLower() == atr.Name.ToLower());
                    if (found.Count() == 0)
                        NewHTMLtd.Attributes.Add(atr);
                }
                if (NewHTMLtd.Attributes["style"] != null)
                    style = NewHTMLtd.Attributes["style"].Value;

                if (CellNode.Attributes["head"] == null && CellNode.Attributes["stub"] == null && CellNode.Attributes["subhead"] == null)
                {
                    if (CellNode.ChildNodes.Count > 0)
                    { 
                        for (mChild = CellNode.ChildNodes.Count - 1; mChild > -1; mChild--)
                        {
                            if (GetCleanText(CellNode.ChildNodes[mChild].InnerText) != "") break;
                        }
                        if (mChild == CellNode.ChildNodes.Count || mChild < 0) mChild = 0;

                        ParaNode = CellNode.ChildNodes[mChild];

                        if (ParaNode.Name == "div")
                        {
                            if (CellNode.ChildNodes[mChild].HasChildNodes)
                                ParaNode = CellNode.ChildNodes[mChild].ChildNodes[0];
                        }


                        if (ParaNode.Attributes["style"] != null)
                        {
                            string[] ParaStyleSub = ParaNode.Attributes["style"].Value.Split(';');
                            for (i = 0; i < ParaStyleSub.Length; i++)
                            {
                                if (ParaStyleSub[i].Trim().StartsWith("text-indent"))
                                    continue;

                                if (ParaStyleSub[i].Trim().StartsWith("font") || ParaStyleSub[i].Trim().StartsWith("text") || ParaStyleSub[i].Trim().StartsWith("color:"))
                                    style += ";" + ParaStyleSub[i].Trim() + ";";
                            }
                        }
                        if (ParaNode.ChildNodes.Count > 0)
                        { 
                            spanNode = null;
                            spanStyleSub = null;
                            for (int spIndex = 0; spIndex < ParaNode.ChildNodes.Count; spIndex++)
                            {
                                spanNode = ParaNode.ChildNodes[spIndex];
                                if (spanNode.Name == "font")
                                {
                                    ParaNode = spanNode;
                                    break;
                                }
                                if (spanNode.Name != "span" && spanNode.ChildNodes.Count > 0)
                                {
                                    spanNode = spanNode.ChildNodes[0];
                                }
                                if (spanNode.Name == "span") //if (spanNode.Name == "span" && !StyleSet)
                                {
                                    Content = GetCleanText(spanNode.InnerText);
                                    if (Content != "")
                                    {
                                        if (spanNode.Attributes["style"] != null)
                                        {
                                            spanStyleSub = spanNode.Attributes["style"].Value.Split(';');
                                            for (i = 0; i < spanStyleSub.Length; i++)
                                            {
                                                if (style.Contains(spanStyleSub[i].Trim())) continue;

                                                if (style.Contains("font-size:") && spanStyleSub[i].Trim().StartsWith("font-size:")) continue;

                                                if (spanNode.InnerText.Contains("#@super") || spanNode.InnerText.Contains("#@SUB"))
                                                {
                                                    if (spanStyleSub[i].Trim().StartsWith("font-size"))
                                                        continue;
                                                }

                                                if (spanStyleSub[i].Trim().StartsWith("text-indent"))
                                                    continue;

                                                if (spanStyleSub[i].Trim().StartsWith("font") || spanStyleSub[i].Trim().StartsWith("text") || spanStyleSub[i].Trim().StartsWith("color:"))
                                                    style += ";" + spanStyleSub[i].Trim() + ";";
                                            }
                                        }                                      
                                    }
                                }
                            }
                        }


                        if (ParaNode.Name == "b" || CellNode.InnerHtml.Contains("</b>"))
                        {
                            style += ";font-weight:bold;";
                        }
                        else if (ParaNode.Name == "font")
                        {
                            if (ParaNode.Attributes["color"] != null)
                            {
                                if (ParaNode.Attributes["color"].Value != "")
                                    style += ";color:" + ParaNode.Attributes["color"].Value + ";";
                            }

                            if (ParaNode.Attributes["style"] != null)
                            {
                                if (ParaNode.Attributes["style"].Value.ToLower().Replace(" ", "").Contains("font-weight:bold"))
                                {
                                    style += ";font-weight:bold;";
                                }
                                if (ParaNode.Attributes["style"].Value.ToLower().Replace(" ", "").Contains("font-size:"))
                                {
                                    spanStyleSub = ParaNode.Attributes["style"].Value.ToLower().Split(';');
                                    for (i = 0; i < spanStyleSub.Length; i++)
                                    {
                                        if (spanStyleSub[i].Trim().StartsWith("font-size"))
                                        {
                                            style += ";" + spanStyleSub[i].Trim() + ";";
                                            if (ParaNode.Attributes["size"] != null)
                                                ParaNode.Attributes["size"].Remove();
                                            break;
                                        }
                                    }
                                }
                            }
                            if (ParaNode.Attributes["size"] != null)
                            {
                                if (ParaNode.Attributes["size"].Value != "")
                                {
                                    try
                                    {
                                        int mSize = Convert.ToInt16(ParaNode.Attributes["size"].Value);
                                        if (mSize < 8)
                                            style += ";font-size:" + fontSizeInPoints[mSize].ToString() + "pt;";
                                    }
                                    catch
                                    {
                                    }
                                }
                            }
                        }
                    }
                }
                if (style != string.Empty)
                {
                    if (NewHTMLtd.Attributes["style"] != null)
                        NewHTMLtd.Attributes["style"].Remove();

                    NewHTMLtd.Attributes.Add("style", style);
                }
                trNode.ReplaceChild(NewHTMLtd, CellNode);
            }
            catch (Exception ex)
            {
            }
            return trNode.InnerHtml;
        }




// 9)
        /// <summary>
        /// Decides whether a table row should be treated as a sub-heading row.
        /// </summary>
        /// <param name="trNode">Row whose child nodes are inspected.</param>
        /// <param name="FirstNumericColumnIndex">Child index at which the cell scan starts.</param>
        /// <param name="FirstNumericRowIndex">Rows with an index below this are never sub-heads.</param>
        /// <param name="mRowIndex">Index of <paramref name="trNode"/> within its table.</param>
        /// <returns>
        /// <c>true</c> when the row mentions "colspan", carries some text, and no scanned cell
        /// disqualifies it; <c>false</c> otherwise, including when any exception is raised.
        /// </returns>
        static bool isSubHeadRow(HtmlNode trNode, int FirstNumericColumnIndex, int FirstNumericRowIndex, int mRowIndex)
        {
            try
            {
                // The row markup must mention "colspan" somewhere (case-insensitive).
                string rowMarkup = trNode.InnerHtml.ToLower();
                if (!rowMarkup.Contains("colspan") || mRowIndex < FirstNumericRowIndex)
                {
                    return false;
                }

                // A row with no visible text can never be a sub-heading.
                if (GetCleanText(trNode.InnerText) == "")
                {
                    return false;
                }

                bool sawText = false;
                int position = FirstNumericColumnIndex;
                while (position < trNode.ChildNodes.Count)
                {
                    HtmlNode cell = trNode.ChildNodes[position];
                    position++;

                    string cellText = GetCleanText(cell.InnerText);
                    if (cellText == "")
                    {
                        continue;
                    }
                    sawText = true;

                    // CheckFinancialColumn is defined elsewhere; presumably it recognises numeric or
                    // financial values (confirm against its definition). Short texts (under 10 chars)
                    // that are not purely alphabetic are treated the same way.
                    bool isDataLike = CheckFinancialColumn(cellText) || (cellText.Length < 10 && !IsStringAlpha(cellText));
                    if (!isDataLike)
                    {
                        continue;
                    }

                    // A data-like cell is tolerated only when the node right after it spans more than four columns.
                    HtmlNode follower = cell.NextSibling;
                    bool followedByWideCell = follower != null
                        && follower.Attributes["colspan"] != null
                        && Convert.ToInt32(follower.Attributes["colspan"].Value) > 4;
                    if (!followedByWideCell)
                    {
                        return false;
                    }
                }

                // If every scanned cell was blank the row does not qualify.
                return sawText;
            }
            catch
            {
                // Any failure (null row, bad index, unparsable colspan, ...) means "not a sub-head".
                return false;
            }
        }

// 10)
        /// <summary>
        /// Tells whether a string consists solely of ASCII letters and round brackets.
        /// </summary>
        /// <param name="str">Text to test; may be null.</param>
        /// <returns>
        /// <c>true</c> when <paramref name="str"/> is non-blank and every character is one of
        /// <c>a-z</c>, <c>A-Z</c>, <c>(</c> or <c>)</c>; <c>false</c> for null, empty,
        /// whitespace-only or any other content.
        /// </returns>
        private static bool IsStringAlpha(string str)
        {
            // Null used to raise a NullReferenceException via str.Trim(); treat it like blank input.
            if (string.IsNullOrWhiteSpace(str))
            {
                return false;
            }

            // The static overload goes through the framework's regex cache, so the pattern
            // is not re-parsed on every call the way "new Regex(...)" was.
            return Regex.IsMatch(str, @"^[a-zA-Z()]*$");
        }
11) static void InsertCellForRowSpan(ref HtmlNode table, int rowIndex, int cellIndex, int rowspan, HtmlNode Maintdnode)
        {
            // Purpose: for the rows that follow row `rowIndex`, inserts an empty placeholder <td>
            // (marked row-span-cell="true") at logical column `cellIndex`, so that a cell which
            // originally spanned `rowspan` rows is represented by one cell in every row it covered.
            //
            // table      - node whose ChildNodes are walked as rows (assumes every child is a <tr>;
            //              every child, whatever its type, advances trIndex - TODO confirm with caller).
            // rowIndex   - index of the row holding the spanning cell; only later rows are touched.
            // cellIndex  - logical column (colspan-aware) where the placeholder is inserted.
            // rowspan    - original rowspan of the spanning cell; decremented per row filled.
            // Maintdnode - the spanning cell; its style and colspan are copied onto each placeholder.
            //
            // All exceptions are swallowed by the catch at the bottom, so a failure leaves the
            // table partially processed without any signal to the caller.
            int tdIndex = 0;
            int trIndex = 0;
            int mNormalizedCellCountIndex = 0;
            int mRecursiveRowSpan = 0;
            try
            {
                foreach (HtmlNode trnode in table.ChildNodes)
                {
                    // Only rows below the spanning cell, and only while more than one row is still to be covered.
                    if (trIndex > rowIndex && rowspan - 1 > 0)
                    {
                        tdIndex = 0;

                        if (trnode.ChildNodes.Count > 0)
                        {
                            // Index of the last logical column of this row (colspans summed, minus one).
                            mNormalizedCellCountIndex = GetNormalizedCellCountIndex(trnode);
                            foreach (HtmlNode tdnode in trnode.ChildNodes)
                            {
                                // A cell in this row that itself spans rows is expanded first (recursively),
                                // then reset to rowspan 1 and tagged with its original value.
                                if (tdnode.Attributes["rowspan"] != null)
                                {
                                    if (tdnode.Attributes["rowspan"].Value != "1")
                                    {
                                        mRecursiveRowSpan = Convert.ToInt32(tdnode.Attributes["rowspan"].Value);
                                        InsertCellForRowSpan(ref table, trIndex, tdIndex, mRecursiveRowSpan, tdnode);
                                        tdnode.Attributes["rowspan"].Value = "1";
                                        tdnode.Attributes.Add("Rowspanremoved", "true");
                                        tdnode.Attributes.Add("OriginalRowspan", "" + mRecursiveRowSpan + "");
                                    }
                                }
                                // Insert when the row is too short to reach cellIndex (append at the end),
                                // or when the running logical column equals cellIndex (insert before this cell).
                                if (mNormalizedCellCountIndex < cellIndex || tdIndex == cellIndex)
                                {
                                    HtmlNode newNode = HtmlNode.CreateNode("<td style=\"white-space:nowrap;padding-right:5px;padding-left:5px;\" row-span-cell=\"true\" ></td>");
                                    if (Maintdnode.Attributes["style"] != null)
                                    {
                                        // Placeholder style = spanning cell's style followed by the default padding style.
                                        newNode.Attributes["style"].Value = Maintdnode.Attributes["style"].Value + ";" + newNode.Attributes["style"].Value;
                                        // Unless both top and bottom borders are solid, the text "border-top" is removed.
                                        // NOTE(review): only the substring "border-top" is stripped; the rest of the
                                        // declaration (e.g. "-style:solid") stays in the style value - confirm this is intended.
                                        if (!(newNode.Attributes["style"].Value.Trim().Contains("border-top-style:solid") && newNode.Attributes["style"].Value.Trim().Contains("border-bottom-style:solid")))
                                            newNode.Attributes["style"].Value = newNode.Attributes["style"].Value.Replace("border-top", "");
                                    }

                                    // The placeholder is as wide as the spanning cell.
                                    if (Maintdnode.Attributes["colspan"] != null)
                                        newNode.Attributes.Add("colspan", Maintdnode.Attributes["colspan"].Value);
                                    if (mNormalizedCellCountIndex < cellIndex)
                                        trnode.InsertAfter(newNode, trnode.LastChild);
                                    else
                                        trnode.InsertBefore(newNode, tdnode);

                                    // NOTE(review): the enclosing guard requires rowspan - 1 > 0, so rowspan is at
                                    // least 2 here and this early return does not appear reachable.
                                    if (rowspan < 1) return;
                                    rowspan -= 1;
                                    // Leave the cell loop right away: trnode.ChildNodes was just modified.
                                    break;
                                }
                                // Advance the logical column by this cell's width.
                                if (tdnode.Attributes["colspan"] != null)
                                {
                                    tdIndex += Convert.ToInt16(tdnode.Attributes["colspan"].Value);
                                }
                                else
                                {
                                    tdIndex += 1;
                                }

                            }
                        }
                        else
                        {
                            // Row has no child nodes at all: the placeholder simply becomes its only cell.
                            HtmlNode newNode = HtmlNode.CreateNode("<td style=\"white-space:nowrap;padding-right:5px;padding-left:5px;\" row-span-cell=\"true\" ></td>");
                            if (Maintdnode.Attributes["style"] != null)
                                newNode.Attributes["style"].Value = Maintdnode.Attributes["style"].Value + ";" + newNode.Attributes["style"].Value;

                            if (Maintdnode.Attributes["colspan"] != null)
                                newNode.Attributes.Add("colspan", Maintdnode.Attributes["colspan"].Value);

                            trnode.AppendChild(newNode);

                            if (rowspan < 1) return;
                            rowspan -= 1;
                        }
                    }
                    else if (rowspan - 1 == 0)
                    {
                        // Remaining span is 1: every covered row has received its placeholder.
                        break;
                    }
                    trIndex += 1;
                }
            }
            catch (Exception ex)
            {
                // Deliberately empty: errors are ignored (best-effort normalisation).
            }
        }

// 12)
        /// <summary>
        /// Computes the zero-based index of the last logical column of a row, counting each
        /// child node as many columns as its <c>colspan</c> attribute says (1 when absent).
        /// </summary>
        /// <param name="trnode">Row node whose children are counted; may be null.</param>
        /// <returns>
        /// Sum of the column widths minus one. Returns -1 for a null row (same value an empty
        /// row yields). If a <c>colspan</c> value cannot be converted, falls back to the plain
        /// child count minus one.
        /// </returns>
        static int GetNormalizedCellCountIndex(HtmlNode trnode)
        {
            // The original guarded the loop against null but then dereferenced trnode in the
            // fallback return, so a null row still raised NullReferenceException.
            if (trnode == null)
            {
                return -1;
            }

            try
            {
                int columnTotal = 0;
                foreach (HtmlNode cell in trnode.ChildNodes)
                {
                    var colspan = cell.Attributes["colspan"];
                    columnTotal += (colspan != null) ? Convert.ToInt32(colspan.Value) : 1;
                }
                return columnTotal - 1;
            }
            catch
            {
                // Best-effort fallback kept from the original: when any colspan is malformed,
                // ignore spans altogether and count every child as one column.
                return trnode.ChildNodes.Count - 1;
            }
        }
// 13)
        /// <summary>
        /// Strips everything that does not count as visible cell content: no-break-space
        /// entities, the internal <c>@#...#@</c> underline markers, spaces and line breaks.
        /// </summary>
        /// <param name="Text">Raw cell text; may be null.</param>
        /// <returns>
        /// <paramref name="Text"/> with all listed tokens removed, or an empty string when
        /// <paramref name="Text"/> is null or empty.
        /// </returns>
        static string GetCleanText(string Text)
        {
            // Null used to raise a NullReferenceException; callers only compare the result with "".
            if (string.IsNullOrEmpty(Text))
            {
                return string.Empty;
            }

            // Tokens are removed one after another in this exact order.
            // The original chained Replace(" ", "") twice in a row, which made the second call a
            // no-op; given the neighbouring NBSP entities, one of them was presumably a literal
            // no-break space, so it is spelled out here as an explicit escape.
            string[] removable =
            {
                "&#160;", "&nbsp;", "&#xa0;",
                "@#double#@", "@#single#@", "@#doubletop#@", "@#singletop#@", "@#u#@",
                "\u00A0", " ", "\r", "\n"
            };

            string cleaned = Text;
            foreach (string token in removable)
            {
                cleaned = cleaned.Replace(token, "");
            }
            return cleaned;
        }

// 14)
        /// <summary>
        /// Flattens a table so that no cell spans more than one row or one column: rowspans are
        /// expanded through <c>InsertCellForRowSpan</c>, colspans by adding empty cells after the
        /// spanning cell. The original span values are kept in <c>OriginalRowspan</c> /
        /// <c>OriginalColspan</c> attributes.
        /// </summary>
        /// <param name="table">Node whose child nodes are walked as rows, and their children as cells.</param>
        /// <returns>
        /// <c>table.InnerHtml</c> after processing. Exceptions raised while normalising are
        /// swallowed, so the markup returned may be only partially normalised.
        /// </returns>
        static string NormalizeTable_Simplified(HtmlNode table)
        {
            try
            {
                int rowPos = 0;
                while (rowPos < table.ChildNodes.Count)
                {
                    HtmlNode row = table.ChildNodes[rowPos];

                    // Count is re-read on every pass because padding cells are added while the row is walked.
                    int cellPos = 0;
                    while (cellPos < row.ChildNodes.Count)
                    {
                        HtmlNode cell = row.ChildNodes[cellPos];

                        // Rowspan: push placeholder cells into the rows below, then collapse to 1.
                        var rowspanAttr = cell.Attributes["rowspan"];
                        if (rowspanAttr != null && rowspanAttr.Value != "1")
                        {
                            int spannedRows = Convert.ToInt32(rowspanAttr.Value);
                            InsertCellForRowSpan(ref table, rowPos, cellPos, spannedRows, cell);
                            cell.Attributes["rowspan"].Value = "1";
                            cell.Attributes.Add("OriginalRowspan", spannedRows.ToString());
                        }

                        // Colspan: pad with (span - 1) empty cells right after this one, then collapse to 1.
                        var colspanAttr = cell.Attributes["colspan"];
                        if (colspanAttr != null && colspanAttr.Value != "1")
                        {
                            int spannedCols = Convert.ToInt32(colspanAttr.Value);
                            int padding = spannedCols - 1;
                            while (padding > 0)
                            {
                                row.InsertAfter(HtmlNode.CreateNode("<td></td>"), cell);
                                padding--;
                            }
                            cell.Attributes["colspan"].Value = "1";
                            cell.Attributes.Add("OriginalColspan", spannedCols.ToString());
                        }

                        cellPos++;
                    }

                    rowPos++;
                }
            }
            catch (Exception)
            {
                // Best effort: whatever was normalised before the failure is returned as-is.
            }
            return table.InnerHtml;
        }