Thursday 13 March 2014

How to filter/search tags in html document using htmlAgilityPack

How to filter/search tags in html document using htmlAgilityPack

Ways to filter HTML elements based on a filter criterion. The list below shows different filter criteria (XPath expressions) assigned to the sFilterCriteria variable, which is later used for filtering.

1)    string sFilterCriteria="//elementName";
Examples:
·         string sFilterCriteria = "//div";
·         string sFilterCriteria = "//img";
·         string sFilterCriteria = "//a";

      Description: Selects all <elementName> elements of the given HTML document


2)    string sFilterCriteria="//elementName[@AttributeName]";
Examples:
·         string sFilterCriteria = "//div[@id]";
·         string sFilterCriteria = "//img[@alt]";      
·         string sFilterCriteria = "//p[@style]";
    
Description: Selects all <elementName> elements of the given HTML document that have the AttributeName attribute (for example, id)

3)    string sFilterCriteria="//elementName[@AttributeName='AttributeValue']";
Examples:
·         string sFilterCriteria = "//div[@id='div1']";
·         string sFilterCriteria = "//a[@href='MrGST']";
·         string sFilterCriteria = "//img[@alt='title']";

 Description: To filter all <elementName> elements having  AttributeName attribute with value AttributeValue of given Html Document

4)    string sFilterCriteria="//*[@AttributeName]";
Examples:
·         string sFilterCriteria = "//*[@id]";
·         string sFilterCriteria = "//*[@href]";
·         string sFilterCriteria = "//*[@face]";
·         string sFilterCriteria = "//*[@alt]";
·         string sFilterCriteria = "//*[@src]";

 Description: To filter all html elements having  AttributeName attribute of given Html Document

5)    string sFilterCriteria="//*[@AttributeName='AttributeValue']";
Examples:
·         string sFilterCriteria = "//*[@id='div1']";
·         string sFilterCriteria = "//*[@href='MrGST']";
·         string sFilterCriteria = "//*[@href='MrGST']";

 Description: To filter all html elements having  AttributeName attribute with value AttributeValue of given Html Document

6)    string sFilterCriteria="//*[@AttributeName='AttributeValue']";
7)   Conditional filtration criteria
Examples:
·         string sFilterCriteria = "//img[@src and (@width or @height)]";
·         string sFilterCriteria = "//span[@lang='EN-US' and @style]";
·         string sFilterCriteria = "//*[contains(@style, 'Wingding')]";

[Note: In the last example, contains() is used to check whether the text 'Wingding' is present in the style attribute]
Running the Filter Criteria for HTMLDocument
Use the code below to get the list of HTML elements in the document that match the filter criteria sFilterCriteria
Create html document object
// Create the document object and parse the HTML markup into it.
HtmlAgilityPack.HtmlDocument htmDoc = new HtmlAgilityPack.HtmlDocument();
htmDoc.LoadHtml("<html>…………</html>");

Use Search filter criteria sFilterCriteria
// Run the XPath filter against the document created above (htmDoc).
HtmlNodeCollection nc = htmDoc.DocumentNode.SelectNodes(sFilterCriteria);
// The result is checked for null before iterating (no usable result for this filter).
if (nc != null)
{
  foreach (HtmlNode node in nc)
  {
       //Logic body
  }
}

Example:
// Select every <a> element that carries a target attribute.
string sFilterCriteria = "//a[@target]";
HtmlNodeCollection nc = htmDoc.DocumentNode.SelectNodes(sFilterCriteria);
// The result is checked for null before iterating (no usable result for this filter).
if (nc != null)
{
  foreach (HtmlNode node in nc)
  {
       //Logic body
  }
}



Thursday 20 February 2014

How to add target attribute for all links in html document using C#, Dotnet ,HtmlAgilityPack


Adding target attribute for all links in html document using C#, Dotnet ,HtmlAgilityPack


#region "Description"
/*
    Description: This tutorial explains two ways of adding the target attribute to links in an HTML document
    1)  Using Normal String Replacement
    2)  Using HTMLAgilityPack       
*/
#endregion       


//1)  Using Normal String Replacement

        /// <summary>
        /// add Target Elements to links
        /// </summary>
        /// <param name="doc">html string without target element</param>
        /// <returns>>html string with target element</returns>
        static string AddTargetElementsForLinksUsingReplacement(string HtmlString)
        {
            try
            {
                HtmlString = HtmlString.Replace("<a ", "<a target='_blank' ");  
            }
            catch (Exception ex)
            {
            }
            return HtmlString;
        }



//2)  Using HTMLAgilityPack    
        /// <summary>
        /// add Target Elements to links
        /// </summary>
        /// <param name="htmDoc">HTMLAgiltiyPack Object of html doc without target element</param>
        /// <returns>>HTMLAgiltiyPack Object of html doc with target element</returns>
        static HtmlDocument AddTargetElementsForLinks(HtmlDocument htmDoc)
{
    string strPreviousOuterHtml = string.Empty;         
    try
    {
        HtmlNodeCollection nc = htmDoc.DocumentNode.SelectNodes("//a");
        if (nc != null)
        {
            foreach (HtmlNode node in nc)
            {
                strPreviousOuterHtml = node.OuterHtml;
                if (node.Attributes["target"] != null)
                    node.Attributes["target"].Value = "_blank";
                else
                    node.Attributes.Add("target", "_blank");

                htmDoc.DocumentNode.InnerHtml = htmDoc.DocumentNode.InnerHtml.Replace(strPreviousOuterHtml, node.OuterHtml);                      
            }
        }
    }
    catch (Exception ex)
    {
    }
    return htmDoc;
}



        /// <summary>
        /// Adds Target Elemetns to links
        /// </summary>
        /// <param name="doc">html string without target element</param>
        /// <returns>>html string with target element</returns>
        static string AddTargetElementsForLinks(string HtmlString)
         {
             string strPreviousOuterHtml = string.Empty;
             HtmlDocument htmDoc = new HtmlDocument();
             try
             {               
                 htmDoc.LoadHtml(HtmlString);
                 HtmlNodeCollection nc = htmDoc.DocumentNode.SelectNodes("//a");
                 if (nc != null)
                 {
                     foreach (HtmlNode node in nc)
                     {
                         strPreviousOuterHtml = node.OuterHtml;
                         if (node.Attributes["target"] != null)
                             node.Attributes["target"].Value = "_blank";
                         else
                             node.Attributes.Add("target", "_blank");

                         htmDoc.DocumentNode.InnerHtml = htmDoc.DocumentNode.InnerHtml.Replace(strPreviousOuterHtml, node.OuterHtml);
                     }
                 }
             }
             catch (Exception ex)
             {
             }
             return  htmDoc.DocumentNode.OuterHtml;
         }


How to convert excel objects in word document to html using Aspose , HTMLAgilityPack

Description: In this tutorial, we will learn how to convert a Word document containing embedded Excel objects into an HTML document, converting each embedded Excel object into its own HTML content and placing it at the same position it had in the Word document, using Aspose.Words, Aspose.Cells and HtmlAgilityPack

(Note: Aspose is a third-party tool used to work with Microsoft documents such as Word, Excel, PDF, etc. Also, this logic does not work for linked Excel files; only embedded ones are handled)

Diagrammatic representation:

                                                                                                         



Word Doc with Excel Objects

|
V

Conversion Process

|
V

HTML Doc with Excel contents










                                                                                                         
                                                                                                         



Steps in Conversion Process


1.    Mark Excel Object Positioning using Aspose.Word

2.    Convert Main word document into html using Aspose.Word conversion

3.    Now convert each excel object in word document into respective html doc using Aspose.cells and replace that excel  derived html content in main word derived html content using marked excel position




Word Doc as Input:



Word Content 1

Excel object 1

Word Content 2

Excel object 2



 






                                                                                                         
HTML Doc as Output:
                                                                                                         


Word HTML Content 1

Excel HTML Content 1

Word HTML Content 2

Excel HTML Content 2









                                                  
                                                  


Method execution Order:

1)    objDocument = ExcelTableMarking(objDocument); //First Method

2)    objHTMLdoc = GetHtmlDataOfAllExcelFiles(objHTMLdoc, objDocument); //Second Method (returns the updated HTML document)

  Both methods are called from the ExampleRun method, which is used as shown below:

  string OutputHTML = ExampleRun(WordDocumentFilePath);

                                                  
                                                  


Method definitions:
                                                  
                                                  

/// <summary>
/// Converts a Word document to HTML and replaces every embedded Excel object
/// with the HTML produced from its workbook.
/// </summary>
/// <param name="WordDocumentFilePath">Path of the Word document to convert</param>
/// <returns>HTML of the converted document</returns>
public static string ExampleRun(string WordDocumentFilePath)
{
    HtmlAgilityPack.HtmlDocument htmlDocument = new HtmlAgilityPack.HtmlDocument();

    using (MemoryStream htmlStream = new MemoryStream())
    {
        //Create Aspose word document object
        Aspose.Words.Document objDocument = new Aspose.Words.Document(WordDocumentFilePath);

        //Provide the settings for image objects in the word document
        Aspose.Words.Saving.HtmlSaveOptions lSaveOptions = new Aspose.Words.Saving.HtmlSaveOptions(Aspose.Words.SaveFormat.Html);
        lSaveOptions.ExportImagesAsBase64 = true;//For Embedded image | false to download image to a folder after conversion
        // NOTE(review): Server is not declared in this snippet - presumably the ASP.NET server utility; confirm it is reachable from a static method.
        lSaveOptions.ImagesFolder = Server.MapPath("~/Images");
        lSaveOptions.ImagesFolderAlias = "../../Images";

        //Mark Excel objects in word document (ExcelTableMarking takes the document only)
        objDocument = ExcelTableMarking(objDocument);
        objDocument.AcceptAllRevisions();

        //Convert Word document to html
        objDocument.Save(htmlStream, lSaveOptions);

        //Read html string from the memory stream; only the first Length bytes of the buffer hold data
        string html = System.Text.Encoding.UTF8.GetString(htmlStream.GetBuffer(), 0, (int)htmlStream.Length);

        //Load Html document using HtmlAgilityPack
        htmlDocument.LoadHtml(html);

        //Replace all excel objects with their respective HTML contents
        htmlDocument = GetHtmlDataOfAllExcelFiles(htmlDocument, objDocument);
    }

    return htmlDocument.DocumentNode.OuterHtml;
}

/// <summary>
/// Marks every embedded (non-linked) Excel OLE object of the document by setting
/// its alternative text to "$EXCEL$" followed by a running number starting at 1.
/// </summary>
/// <param name="doc">Aspose.Words document to scan for shapes</param>
/// <returns>The same document instance, with the marked shapes</returns>
public static Document ExcelTableMarking(Document doc)
{
    NodeCollection shapes = doc.GetChildNodes(NodeType.Shape, true);
    int excelNo = 1;

    foreach (Aspose.Words.Drawing.Shape shape in shapes)
    {
        if (shape.OleFormat == null)
        {
            continue;
        }

        try
        {
            string extension = shape.OleFormat.SuggestedExtension;

            // ".xls" is a substring of ".xlsx", so one check covers both extensions.
            if (extension != null && extension.Contains(".xls") && !shape.OleFormat.IsLink)
            {
                shape.AlternativeText = "$EXCEL$" + excelNo;
                excelNo += 1;
            }
        }
        catch (Exception)
        {
            // Best effort: a shape whose OLE data cannot be inspected is left unmarked
            // and the remaining shapes are still processed.
        }
    }

    return doc;
}

/// <summary>
/// Replaces every img element whose alt attribute contains a '$' marker with the
/// HTML of the matching embedded Excel workbook, followed by a line break.
/// </summary>
/// <param name="doc">HTML document produced from the Word document</param>
/// <param name="Maindoc">Word document that holds the embedded Excel objects</param>
/// <returns>The same HtmlDocument instance, reloaded with the replaced HTML</returns>
static HtmlDocument GetHtmlDataOfAllExcelFiles(HtmlDocument doc, Document Maindoc)
{
    string html = doc.DocumentNode.OuterHtml;

    HtmlNodeCollection images = doc.DocumentNode.SelectNodes(".//img");
    if (images != null)
    {
        foreach (HtmlNode image in images)
        {
            // Images without an alt attribute are skipped. Dereferencing the missing
            // attribute used to throw and silently abort all remaining images.
            HtmlAttribute alt = image.Attributes["alt"];
            if (alt == null || alt.Value == null || !alt.Value.Contains("$"))
            {
                continue;
            }

            string imageHtml = image.OuterHtml;
            string excelName = alt.Value.Replace("$", "");
            string excelHtml = GetHtmlDataOfExcelFile(excelName, Maindoc);

            // Keep the original image when no workbook HTML could be produced,
            // instead of replacing it with a bare line break.
            if (!string.IsNullOrEmpty(excelHtml))
            {
                html = html.Replace(imageHtml, excelHtml + "<br/>");
            }
        }
    }

    doc.LoadHtml(html);
    return doc;
}



  static string GetHtmlDataOfExcelFile(string ExcelLinkName, Document Maindoc)
        {
            string html = string.Empty;
            try
            {
                MemoryStream FileStream = new MemoryStream();
                string extension = string.Empty;
                NodeCollection Shapes = Maindoc.GetChildNodes(NodeType.Shape, true);
                foreach (Aspose.Words.Drawing.Shape shape in Shapes)
                {
                    if (shape.OleFormat != null)
                    {
                        extension = shape.OleFormat.SuggestedExtension.ToString();
                        if ((extension.Contains(".xls") || extension.Contains(".xlsx")) && shape.OleFormat.IsLink == false)
                        {
                            string ExcelName = shape.AlternativeText.Replace("$", "");
                            if (ExcelName == ExcelLinkName)
                            {
                                shape.OleFormat.Save(FileStream); break;
                            }

                        }
                    }
                }
                if (FileStream != null)
                {
                    Aspose.Cells.Workbook workbook = new Aspose.Cells.Workbook(FileStream);
                    using (MemoryStream htmlStream = new MemoryStream())
                    {
                        workbook.Save(htmlStream, Aspose.Cells.SaveFormat.Html);
                        html = System.Text.Encoding.UTF8.GetString(htmlStream.GetBuffer(), 0, (int)htmlStream.Length);
                        while (html[0] != '<')
                            html = html.Substring(1);
                    }
                }

                if (html != "")
                {                 
                    HtmlDocument doc = new HtmlDocument();
                    HtmlNodeCollection nc = null;                                      
                    doc.LoadHtml(html);
                    #region Body Capture
                    nc = doc.DocumentNode.SelectNodes("//body");
                    {
                        if (nc != null)
                        {
                            foreach (HtmlNode node in nc)
                            {
                                html = node.InnerHtml;
                                break;
                            }
                        }
                    }
                    doc.LoadHtml(html);
                    #endregion                  
                    html = doc.DocumentNode.OuterHtml;
                }
            }
            catch
            {
            }
            return html;
        }