R – word ifilter for docx parser error

ifilter

.docx documents do not appear to be indexed correctly.

I used a unique string in a .docx, but the .docx is not returned when I search on "one".

For example, take the following text, where "one" is the last word of a line and "and" is the first word of the next line:

"Here is the text for line one and here is the text for line two."

The IFilter extracts it as:

"Here is the text for line oneand here is the text for line two."

So when the IFilter parses the .docx, it drops the line-break separator and ends up indexing "oneand" as a single word instead of "one" and "and".

So it seems that the Word ifilter for .docx concatenates the last word of a line with the first word of the next line.

Can anyone give some ideas of how to get around this issue?

Thanks in advance.

Best Answer

OK, I figured this one out now. Basically, the 64-bit IFilter is not working correctly: it merges words that are separated by line breaks instead of carrying the line breaks through as word separators. I used Ionic.Zip (DotNetZip) to access the docx zip archive and parsed the important XML files using a slightly modified version of DocxToText. This works perfectly now.

Here is the modified code, originally created by Jevgenij Pankov:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Ionic.Zip;
using System.IO;
using System.Xml;

/// <summary>
/// Extracts the plain text of a .docx file by opening the file as a zip
/// package and reading the WordprocessingML of its main document part.
/// </summary>
public class DocxToText
{
    private const string ContentTypeNamespace =
        @"http://schemas.openxmlformats.org/package/2006/content-types";

    private const string WordprocessingMlNamespace =
        @"http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    // Selects the <Override> element of "[Content_Types].xml" whose content
    // type is the main word-processing document part.
    private const string DocumentXmlXPath =
        "/t:Types/t:Override[@ContentType=\"" +
        "application/vnd.openxmlformats-officedocument." +
        "wordprocessingml.document.main+xml\"]";

    private const string BodyXPath = "/w:document/w:body";

    private const string ContentTypesEntryName = "[Content_Types].xml";

    private readonly string docxFile;
    private string docxFileLocation = "";

    /// <summary>
    /// Creates an extractor for the given .docx file. The file is not opened
    /// until <see cref="ExtractText"/> is called.
    /// </summary>
    /// <param name="fileName">Path of the .docx file to read.</param>
    public DocxToText(string fileName)
    {
        docxFile = fileName;
    }

    /// <summary>
    /// Extracts text from the Docx file.
    /// </summary>
    /// <returns>
    /// Extracted text, or an empty string when the main document part is
    /// missing from the package or has no body element.
    /// </returns>
    /// <exception cref="InvalidOperationException">
    /// No input file name was supplied to the constructor.
    /// </exception>
    /// <exception cref="InvalidDataException">
    /// The package does not declare a main document part.
    /// </exception>
    public string ExtractText()
    {
        if (string.IsNullOrEmpty(docxFile))
            throw new InvalidOperationException("Input file not specified.");

        // Usually it is "word/document.xml"
        docxFileLocation = FindDocumentXmlLocation();

        if (string.IsNullOrEmpty(docxFileLocation))
            throw new InvalidDataException("It is not a valid Docx file.");

        return ReadDocumentXml();
    }

    /// <summary>
    /// Gets location of the main document part ("document.xml") by looking it
    /// up in the package's "[Content_Types].xml" entry.
    /// </summary>
    /// <returns>
    /// Zip entry name of the main document part without a leading '/', or
    /// null when the content-types entry or the override element is absent.
    /// </returns>
    private string FindDocumentXmlLocation()
    {
        using (ZipFile zip = new ZipFile(docxFile))
        {
            XmlDocument xmlDoc = LoadEntryXml(zip, ContentTypesEntryName);
            if (xmlDoc == null)
                return null;

            // Create an XmlNamespaceManager for resolving namespaces
            XmlNamespaceManager nsmgr =
                new XmlNamespaceManager(xmlDoc.NameTable);
            nsmgr.AddNamespace("t", ContentTypeNamespace);

            XmlNode node = xmlDoc.DocumentElement.SelectSingleNode(
                DocumentXmlXPath, nsmgr);
            if (node == null)
                return null;

            // PartName is written with a leading '/', zip entry names are not.
            string location = ((XmlElement)node).GetAttribute("PartName");
            return location.TrimStart(new char[] { '/' });
        }
    }

    /// <summary>
    /// Reads the main document part located by
    /// <see cref="FindDocumentXmlLocation"/> and converts its body to text.
    /// </summary>
    /// <returns>
    /// Text contained in the document body; an empty string when the entry
    /// or the body element cannot be found.
    /// </returns>
    private string ReadDocumentXml()
    {
        using (ZipFile zip = new ZipFile(docxFile))
        {
            XmlDocument xmlDoc = LoadEntryXml(zip, docxFileLocation);
            if (xmlDoc == null)
                return string.Empty;

            XmlNamespaceManager nsmgr =
                new XmlNamespaceManager(xmlDoc.NameTable);
            nsmgr.AddNamespace("w", WordprocessingMlNamespace);

            XmlNode body =
                xmlDoc.DocumentElement.SelectSingleNode(BodyXPath, nsmgr);

            // ReadNode returns an empty string for a null node.
            return ReadNode(body);
        }
    }

    /// <summary>
    /// Loads the first zip entry whose name equals <paramref name="entryName"/>
    /// (ordinal, case-insensitive) into an <see cref="XmlDocument"/>.
    /// </summary>
    /// <param name="zip">Open zip package to search.</param>
    /// <param name="entryName">Entry name to look for.</param>
    /// <returns>The parsed document, or null when no entry matches.</returns>
    private static XmlDocument LoadEntryXml(ZipFile zip, string entryName)
    {
        foreach (ZipEntry entry in zip)
        {
            // Entry names are identifiers, not linguistic text, so compare
            // ordinally rather than with the current culture.
            if (!string.Equals(entry.FileName, entryName,
                    StringComparison.OrdinalIgnoreCase))
                continue;

            XmlDocument xmlDoc = new XmlDocument();
            // Whitespace inside text runs is significant for the output.
            xmlDoc.PreserveWhitespace = true;
            // The package content is untrusted; never resolve external
            // DTDs or entities while parsing it.
            xmlDoc.XmlResolver = null;

            using (var stream = new MemoryStream())
            {
                entry.Extract(stream);
                stream.Position = 0;
                xmlDoc.Load(stream);
            }
            return xmlDoc;
        }
        return null;
    }

    /// <summary>
    /// Reads content of the node and its nested children.
    /// </summary>
    /// <param name="node">XmlNode to read; may be null.</param>
    /// <returns>
    /// Text contained in the node, or an empty string when the node is null
    /// or is not an element.
    /// </returns>
    private string ReadNode(XmlNode node)
    {
        if (node == null || node.NodeType != XmlNodeType.Element)
            return string.Empty;

        StringBuilder sb = new StringBuilder();
        AppendNodeText(node, sb);
        return sb.ToString();
    }

    /// <summary>
    /// Appends the text of every element below <paramref name="node"/> to
    /// <paramref name="sb"/>, using one shared builder for the whole tree
    /// instead of allocating a builder and a string per recursion level.
    /// </summary>
    /// <param name="node">Element whose children are walked.</param>
    /// <param name="sb">Builder that receives the text.</param>
    private static void AppendNodeText(XmlNode node, StringBuilder sb)
    {
        foreach (XmlNode child in node.ChildNodes)
        {
            if (child.NodeType != XmlNodeType.Element) continue;

            switch (child.LocalName)
            {
                case "t": // Text
                    string space =
                        ((XmlElement)child).GetAttribute("xml:space");
                    if (space == "preserve")
                    {
                        // Leading and trailing whitespace is significant:
                        // copy it verbatim. Trimming and then adding one
                        // trailing space would split a word that continues
                        // in the next run (" wor" + "ld" -> " wor ld").
                        sb.Append(child.InnerText);
                    }
                    else
                    {
                        sb.Append(child.InnerText.TrimEnd());
                    }
                    break;

                case "cr": // Carriage return
                case "br": // Break (line, page or column)
                    sb.Append(Environment.NewLine);
                    break;

                case "tab": // Tab
                    sb.Append("\t");
                    break;

                case "p": // Paragraph: content followed by a blank line
                    AppendNodeText(child, sb);
                    sb.Append(Environment.NewLine);
                    sb.Append(Environment.NewLine);
                    break;

                default: // Any other container: descend into it
                    AppendNodeText(child, sb);
                    break;
            }
        }
    }
}

Here is the usage of this code...

// Point the extractor at the .docx file, then pull out its plain text.
DocxToText extractor = new DocxToText(filepath);
string docxText = extractor.ExtractText();
Related Topic