Friday, 27 September 2013

C#.Net code to normalize nested table with the help of HtmlAgility Pack


C#.NET Code to Normalize Nested Tables with the Help of HtmlAgilityPack

Description: After calling the ProcessNestedTables() or ProcessIndividualNestedTable() function given in this post, a nested table is split into multiple separate tables without any nesting.

1)   ProcessNestedTables(): Normalizes all nested tables present in an HTML document. Pass the HTML document object created using HtmlAgilityPack.

2)   ProcessIndividualNestedTable(): Normalizes an individual nested table. Pass the individual nested table's HTML as a string.


   Before Use (Output)

A
The ABC
First Table
Second Table
/////text
Third Table
/////text
/////text
/////text

   After Use (Output)

A
The ABC
First Table
/////text
/////text

Second Table
/////text

Third Table
/////text


      Methods to Use:

1)  For Complete Html Document:

/// <summary>
/// Walks every <c>&lt;table&gt;</c> element of the document and, for each one that has a
/// parent node, collects its nested tables into a new <c>&lt;div&gt;</c> that is inserted
/// directly after that table.
/// </summary>
/// <param name="objHTMLdoc">Document to process; it is modified in place.</param>
/// <returns>The same document instance that was passed in.</returns>
/// <remarks>
/// This method never throws: any failure is reported through
/// <see cref="System.Diagnostics.Trace"/> and the document is returned in whatever
/// state it reached (best-effort behaviour, as before).
/// </remarks>
static HtmlAgilityPack.HtmlDocument ProcessNestedTables(ref HtmlAgilityPack.HtmlDocument objHTMLdoc)
{
    try
    {
        // SelectNodes yields null (not an empty collection) when nothing matches.
        HtmlAgilityPack.HtmlNodeCollection tableList = objHTMLdoc.DocumentNode.SelectNodes("//table");
        if (tableList == null)
        {
            return objHTMLdoc;
        }

        foreach (HtmlAgilityPack.HtmlNode table in tableList)
        {
            #region formatting of nested tables by Shrikant
            HtmlAgilityPack.HtmlNode parentNode = table.ParentNode;
            if (parentNode == null)
            {
                continue;
            }

            // Numbering of the "T-n" markers restarts at 1 for every table in the list.
            int nestedTableCount = 1;

            // Container for the extracted tables; it starts with a single <br> child.
            HtmlAgilityPack.HtmlNode extractedTables = HtmlAgilityPack.HtmlNode.CreateNode("<div></div>");
            extractedTables.AppendChild(HtmlAgilityPack.HtmlNode.CreateNode("<br>"));

            GenerateTablesFromNestedTables(table, ref extractedTables, ref nestedTableCount);

            // Only attach the container when at least one nested table was extracted.
            if (extractedTables.InnerHtml.Contains("<table"))
            {
                parentNode.InsertAfter(extractedTables, table);
            }
            #endregion
        }
    }
    catch (Exception ex)
    {
        // Keep the original no-throw contract, but make the failure observable
        // instead of swallowing it silently.
        System.Diagnostics.Trace.TraceError("ProcessNestedTables failed: " + ex);
    }
    return objHTMLdoc;//OutPut
}

2)       For Individual Html Table:

/// <summary>
/// Normalizes a single nested table supplied as an HTML string.
/// </summary>
/// <param name="tableHtml">
/// Markup of the table. It is URL-decoded, and double spaces, tabs, line feeds and
/// carriage returns are removed before it is parsed.
/// </param>
/// <returns>
/// The outer HTML of the processed document. If anything fails, the outer HTML of the
/// document in whatever state it reached is returned (best-effort behaviour, as before).
/// </returns>
public string ProcessIndividualNestedTable(string tableHtml)
{
    HtmlAgilityPack.HtmlDocument objHTMLdoc = new HtmlAgilityPack.HtmlDocument();
    try
    {
        // The original statement assigned to an undeclared variable, its Replace calls
        // were missing the argument separator, and the cleaned text was never loaded.
        string cleanedHtml = HttpUtility.UrlDecode(tableHtml)
            .Replace("  ", "")
            .Replace("\t", "")
            .Replace("\n", "")
            .Replace("\r", "");

        objHTMLdoc.LoadHtml(cleanedHtml);
        objHTMLdoc = ProcessNestedTables(ref objHTMLdoc);
    }
    catch (Exception ex)
    {
        // Keep the original no-throw contract, but make the failure observable
        // instead of swallowing it silently.
        System.Diagnostics.Trace.TraceError("ProcessIndividualNestedTable failed: " + ex);
    }
    return objHTMLdoc.DocumentNode.OuterHtml; //Output
}


Supporting methods used inside for processing:

// 1) GenerateTablesFromNestedTables
        /// <summary>
        /// Recursively pulls the tables nested inside <paramref name="table"/> out of it.
        /// A copy of each nested table, wrapped as
        /// <c>&lt;div&gt;&lt;span&gt;T-n&lt;/span&gt;...&lt;br&gt;&lt;/div&gt;</c>, is added to
        /// <paramref name="HTMLNode"/>, and the nested table itself is replaced in its
        /// parent by a <c>&lt;span&gt;T-n&lt;/span&gt;</c> marker carrying the same number.
        /// </summary>
        /// <param name="table">Table whose children and grandchildren are scanned.</param>
        /// <param name="HTMLNode">Container node that receives the extracted tables.</param>
        /// <param name="NestedLevel">
        /// Running counter used for the "T-n" labels; incremented once per extracted table.
        /// </param>
        static void GenerateTablesFromNestedTables(HtmlNode table, ref HtmlNode HTMLNode, ref int NestedLevel)
        {
            // No try/catch here: the former "catch { throw ex; }" only reset the stack
            // trace. Exceptions now propagate unchanged to the caller.

            // A table whose first child is itself a table has its content replaced by
            // that inner table's content.
            if (table.ChildNodes != null && table.ChildNodes.Count > 0)
            {
                HtmlNode invalidInnerTable = table.ChildNodes[0];
                if (invalidInnerTable.Name == "table")
                {
                    table.InnerHtml = invalidInnerTable.InnerHtml;
                }
            }

            for (int trchild = 0; trchild < table.ChildNodes.Count; trchild++)
            {
                HtmlNode trElement = table.ChildNodes[trchild];
                for (int tdchild = 0; tdchild < trElement.ChildNodes.Count; tdchild++)
                {
                    HtmlNode tdElement = trElement.ChildNodes[tdchild];
                    HtmlNode childTableNode = GetTableObject(ref tdElement);
                    if (childTableNode == null)
                    {
                        continue;
                    }

                    // Flatten the deeper levels first, so the copy taken below (via
                    // OuterHtml) already has its own nested tables replaced by markers.
                    GenerateTablesFromNestedTables(childTableNode, ref HTMLNode, ref NestedLevel);

                    HtmlNode formattedChildTableNode = HtmlNode.CreateNode("<div><span>T-" + NestedLevel.ToString() + "</span>" + childTableNode.OuterHtml + "</div>");
                    formattedChildTableNode.AppendChild(HtmlNode.CreateNode("<br>"));

                    // Insert before the container's last child when it has one;
                    // otherwise simply append.
                    if (HTMLNode.ChildNodes.Count > 0)
                    {
                        HTMLNode.InsertBefore(formattedChildTableNode, HTMLNode.LastChild);
                    }
                    else
                    {
                        HTMLNode.AppendChild(formattedChildTableNode);
                    }

                    // The marker must be built before the counter moves on so that it
                    // carries the same number as the extracted copy.
                    HtmlNode htmlNestedMapping = HtmlNode.CreateNode("<span>T-" + NestedLevel.ToString() + "</span>");
                    NestedLevel += 1;

                    HtmlNode parentOfChildTable = childTableNode.ParentNode;
                    if (parentOfChildTable != null)
                    {
                        parentOfChildTable.ReplaceChild(htmlNestedMapping, childTableNode);
                    }
                }
            }
        }

// 2) GetTableObject
        /// <summary>
        /// Searches <paramref name="tdNode"/> for a table node.
        /// </summary>
        /// <param name="tdNode">Node to search; it is read but never reassigned.</param>
        /// <returns>
        /// The first child named "table"; otherwise the result of searching the first child
        /// whose inner HTML contains "&lt;table" (later siblings are not examined after
        /// that); <paramref name="tdNode"/> itself when it has no children and is named
        /// "table"; otherwise <c>null</c>.
        /// </returns>
        static HtmlNode GetTableObject(ref HtmlNode tdNode)
        {
            // No try/catch here: the former "catch { throw ex; }" only reset the stack
            // trace. Exceptions now propagate unchanged to the caller.
            if (tdNode.ChildNodes.Count > 0)
            {
                for (int child = 0; child < tdNode.ChildNodes.Count; child++)
                {
                    HtmlNode candidate = tdNode.ChildNodes[child];
                    if (candidate.Name == "table")
                    {
                        return candidate;
                    }

                    // The table is somewhere deeper inside this child: descend into it.
                    if (candidate.InnerHtml.Contains("<table"))
                    {
                        return GetTableObject(ref candidate);
                    }
                }
            }
            else if (tdNode.Name == "table")
            {
                return tdNode;
            }

            return null;
        }





2 comments:

  1. This comment has been removed by a blog administrator.

    ReplyDelete
  2. This comment has been removed by the author.

    ReplyDelete