diff -Nur a/src/libtracker-extract/tracker-utils.c b/src/libtracker-extract/tracker-utils.c
--- a/src/libtracker-extract/tracker-utils.c	2010-09-02 14:51:00.000000000 +0200
+++ b/src/libtracker-extract/tracker-utils.c	2011-01-04 18:27:05.301708634 +0100
@@ -233,6 +233,61 @@
 }
 
 /**
+ * tracker_text_validate_utf8:
+ * @text: the text to validate
+ * @text_len: length of @text in bytes, or -1 if NUL-terminated
+ * @str: location of the #GString to append the valid bytes to (a new one is
+ *  created if it points to %NULL), or %NULL if not needed.
+ * @valid_len: location to store the number of valid UTF-8 bytes found (set to
+ *  0 when there are none), or %NULL if not needed
+ *
+ * Validates @text using g_utf8_validate() and appends the leading chunk of
+ * valid UTF-8 bytes to @str, giving its length in @valid_len.
+ *
+ * Returns: %TRUE if some bytes were found to be valid, %FALSE otherwise.
+ *
+ * Since: 0.9
+ **/
+gboolean
+tracker_text_validate_utf8 (const gchar  *text,
+                            gssize        text_len,
+                            GString     **str,
+                            gsize        *valid_len)
+{
+	const gchar *end = text;
+	gsize len_to_validate;
+
+	/* Always report a length, even on the early-return paths below */
+	if (valid_len) {
+		*valid_len = 0;
+	}
+	g_return_val_if_fail (text, FALSE);
+
+	len_to_validate = text_len >= 0 ? (gsize) text_len : strlen (text);
+	if (len_to_validate > 0) {
+		/* Get the pointer to the first non-valid character (if any) or
+		 *  to the end of the string. */
+		g_utf8_validate (text, len_to_validate, &end);
+	}
+	if (end == text) {
+		return FALSE;
+	}
+
+	/* Create the output string if the caller did not supply one */
+	if (str) {
+		*str = (*str == NULL ?
+		        g_string_new_len (text, end - text) :
+		        g_string_append_len (*str, text, end - text));
+	}
+
+	if (valid_len) {
+		*valid_len = end - text;
+	}
+
+	return TRUE;
+}
+
+/**
  * tracker_date_format_to_iso8601:
  * @date_string: the date in a string pointer
  * @format: the format of the @date_string
diff -Nur a/src/libtracker-extract/tracker-utils.h b/src/libtracker-extract/tracker-utils.h
--- a/src/libtracker-extract/tracker-utils.h	2010-09-02 14:51:00.000000000 +0200
+++ b/src/libtracker-extract/tracker-utils.h	2011-01-04 18:28:26.122960742 +0100
@@ -34,6 +34,10 @@
 gchar *tracker_text_normalize         (const gchar *text,
                                        guint        max_words,
                                        guint       *n_words);
+gboolean tracker_text_validate_utf8   (const gchar  *text,
+                                       gssize        text_len,
+                                       GString     **str,
+                                       gsize        *valid_len);
 gchar *tracker_date_guess             (const gchar *date_string);
 gchar *tracker_date_format_to_iso8601 (const gchar *date_string,
                                        const gchar *format);
diff -Nur a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c
--- a/src/tracker-extract/tracker-extract-msoffice.c	2010-09-02 16:22:50.000000000 +0200
+++ b/src/tracker-extract/tracker-extract-msoffice.c	2011-01-04 18:25:15.706523770 +0100
@@ -2177,7 +2177,7 @@
 		 * using the given context */
 		tracker_gsf_parse_xml_in_zip (parser_info->uri,
 		                              xml_filename,
-		                              context);
+		                              context, NULL);
 		g_markup_parse_context_free (context);
 	}
 
@@ -2335,7 +2335,7 @@
 	 * using the given context */
 	tracker_gsf_parse_xml_in_zip (uri,
 	                              "[Content_Types].xml",
-	                              context);
+	                              context, NULL);
 
 	if (info.content) {
 		gchar *content;
diff -Nur a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c
--- a/src/tracker-extract/tracker-extract-oasis.c	2010-09-02 14:51:00.000000000 +0200
+++ b/src/tracker-extract/tracker-extract-oasis.c	2011-01-04 19:19:50.938606364 +0100
@@ -24,6 +24,7 @@
 
 #include "tracker-main.h"
 #include "tracker-gsf.h"
+#include "tracker-read.h"
 
 #include <unistd.h>
 
@@ -36,142 +37,126 @@
 	ODT_TAG_TYPE_COMMENTS,
 	ODT_TAG_TYPE_STATS,
 	ODT_TAG_TYPE_CREATED,
-	ODT_TAG_TYPE_GENERATOR
+	ODT_TAG_TYPE_GENERATOR,
+	ODT_TAG_TYPE_WORD_TEXT,
+	ODT_TAG_TYPE_SLIDE_TEXT,
+	ODT_TAG_TYPE_SPREADSHEET_TEXT
 } ODTTagType;
 
+typedef enum {
+	FILE_TYPE_INVALID, /* MIME type not recognised by extract_oasis() */
+	FILE_TYPE_ODP, /* application/vnd.oasis.opendocument.presentation */
+	FILE_TYPE_ODT, /* application/vnd.oasis.opendocument.text */
+	FILE_TYPE_ODS /* application/vnd.oasis.opendocument.spreadsheet */
+} ODTFileType;
+
 typedef struct {
 	TrackerSparqlBuilder *metadata;
 	ODTTagType current;
 	const gchar *uri;
 	gboolean title_already_set;
-} ODTParseInfo;
+} ODTMetadataParseInfo;
 
-static void xml_start_element_handler (GMarkupParseContext   *context,
-                                       const gchar           *element_name,
-                                       const gchar          **attribute_names,
-                                       const gchar          **attribute_values,
-                                       gpointer               user_data,
-                                       GError               **error);
-static void xml_end_element_handler   (GMarkupParseContext   *context,
-                                       const gchar           *element_name,
-                                       gpointer               user_data,
-                                       GError               **error);
-static void xml_text_handler          (GMarkupParseContext   *context,
-                                       const gchar           *text,
-                                       gsize                  text_len,
-                                       gpointer               user_data,
-                                       GError               **error);
-static void extract_oasis             (const gchar           *filename,
-                                       TrackerSparqlBuilder  *preupdate,
-                                       TrackerSparqlBuilder  *metadata);
+typedef struct {
+	ODTTagType current; /* tag type set by the content element handlers */
+	gboolean styles_present; /* TRUE while inside an index or text:section (ODT only) */
+	ODTFileType file_type; /* kind of document being parsed */
+	GString *content; /* text collected so far */
+	gulong bytes_pending; /* bytes that may still be appended to content */
+} ODTContentParseInfo;
+
+static GQuark maximum_size_error_quark = 0; /* error domain used when the text limit is reached */
+
+static void xml_start_element_handler_metadata (GMarkupParseContext   *context,
+                                                const gchar           *element_name,
+                                                const gchar          **attribute_names,
+                                                const gchar          **attribute_values,
+                                                gpointer               user_data,
+                                                GError               **error);
+static void xml_end_element_handler_metadata   (GMarkupParseContext   *context,
+                                                const gchar           *element_name,
+                                                gpointer               user_data,
+                                                GError               **error);
+static void xml_text_handler_metadata          (GMarkupParseContext   *context,
+                                                const gchar           *text,
+                                                gsize                  text_len,
+                                                gpointer               user_data,
+                                                GError               **error);
+static void xml_start_element_handler_content  (GMarkupParseContext   *context,
+                                                const gchar           *element_name,
+                                                const gchar          **attribute_names,
+                                                const gchar          **attribute_values,
+                                                gpointer               user_data,
+                                                GError               **error);
+static void xml_end_element_handler_content    (GMarkupParseContext   *context,
+                                                const gchar           *element_name,
+                                                gpointer               user_data,
+                                                GError               **error);
+static void xml_text_handler_content           (GMarkupParseContext   *context,
+                                                const gchar           *text,
+                                                gsize                  text_len,
+                                                gpointer               user_data,
+                                                GError               **error);
+static void extract_oasis                      (const gchar           *filename,
+                                                TrackerSparqlBuilder  *preupdate,
+                                                TrackerSparqlBuilder  *metadata);
+static void extract_oasis_content              (const gchar           *uri,
+                                                gulong                 total_bytes,
+                                                ODTFileType            file_type,
+                                                TrackerSparqlBuilder  *metadata);
 
 static TrackerExtractData extract_data[] = {
 	{ "application/vnd.oasis.opendocument.*", extract_oasis },
 	{ NULL, NULL }
 };
 
-
-#define ODT_BUFFER_SIZE            8193  /* bytes */
-
-static gchar *
-extract_oasis_content (const gchar *uri,
-                       guint        n_words,
-                       gsize        n_bytes)
+/* Parses content.xml inside the document at @uri and stores the text found,
+ * limited to @total_bytes bytes, as nie:plainTextContent in @metadata. */
+static void
+extract_oasis_content (const gchar          *uri,
+                       gulong                total_bytes,
+                       ODTFileType           file_type,
+                       TrackerSparqlBuilder *metadata)
 {
-	const gchar *argv[4];
-	gint fdz;
-	FILE *fz;
+	gchar *content = NULL; /* only set when parsing succeeded */
+	ODTContentParseInfo info;
+	GMarkupParseContext *context;
 	GError *error = NULL;
-	gchar *text = NULL;
-	gchar *path;
+	GMarkupParser parser = {
+		xml_start_element_handler_content,
+		xml_end_element_handler_content,
+		xml_text_handler_content,
+		NULL,
+		NULL
+	};
 
-	/* Newly allocated string with the file path */
-	path = g_filename_from_uri (uri, NULL, NULL);
+	/* Create parse info */
+	info.current = ODT_TAG_TYPE_UNKNOWN;
+	info.file_type = file_type;
+	info.styles_present = FALSE;
+	info.content = g_string_new ("");
+	info.bytes_pending = total_bytes;
 
-	/* Setup command to be executed */
-	argv[0] = "odt2txt";
-	argv[1] = "--encoding=utf-8";
-	argv[2] = path;
-	argv[3] = NULL;
-
-	g_debug ("Executing command:'%s %s %s' (max words: %u, "
-	         "max_bytes: %" G_GSIZE_FORMAT ")",
-	         argv[0], argv[1], argv[2], n_words, n_bytes);
-
-	/* Fork & spawn */
-	if (!g_spawn_async_with_pipes (g_get_tmp_dir (),
-	                               (gchar **)argv,
-	                               NULL,
-	                               G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL,
-	                               tracker_spawn_child_func,
-	                               GINT_TO_POINTER (10),
-	                               NULL,
-	                               NULL,
-	                               &fdz,
-	                               NULL,
-	                               &error)) {
-		g_warning ("Spawning failed, could not extract text from '%s': %s",
-		           path, error ? error->message : NULL);
-		g_clear_error (&error);
-	}
-	/* Open file descriptor for reading */
-	else if ((fz = fdopen (fdz, "r")) == NULL) {
-		g_warning ("Cannot read child's output... could not extract "
-		           "text from '%s'", path);
-		close (fdz);
-	}
-	/* Start buffered reading... */
-	else {
-		unsigned char buf[ODT_BUFFER_SIZE];
-		size_t r, accum;
-		guint n_words_remaining = n_words;
-		GString *normalized;
-
-		accum = 0;
-		normalized = g_string_new ("");
-
-		/* Reading in chunks of ODT_BUFFER_SIZE -1 (8192)
-		 *   Loop is halted whenever one of this conditions is met:
-		 *     a) Read bytes reached the maximum allowed (n_bytes)
-		 *     b) Already read up to the max number of words configured
-		 *     c) No more bytes to read
-		 */
-		while ((accum <= n_bytes) &&
-		       (n_words_remaining > 0) &&
-		       (r = fread (buf, 1, ODT_BUFFER_SIZE-1, fz))) {
-			gchar *normalized_chunk;
-			guint n_words_normalized;
-
-			/* Always make sure that the read string will be
-			 * NIL-terminated  */
-			buf[r] = '\0';
-			/* Get normalized chunk */
-			normalized_chunk = tracker_text_normalize (buf,
-			                                           n_words_remaining,
-			                                           &n_words_normalized);
-			/* Update number of words remaining.
-			 * Note that n_words_normalized should always be less or
-			 * equal than n_words_remaining */
-			n_words_remaining = (n_words_normalized <= n_words_remaining ?
-			                     n_words_remaining - n_words_normalized : 0);
-			/* Update accumulated */
-			accum += r;
-
-			/* Add normalized chunk to the whole normalized string */
-			g_string_append (normalized, normalized_chunk);
-			g_free (normalized_chunk);
-		}
+	/* Create parsing context */
+	context = g_markup_parse_context_new (&parser, 0, &info, NULL);
 
-		/* fclose() the stream, no need to close() the original FD */
-		fclose (fz);
+	/* Load the internal XML file from the Zip archive, and parse it
+	 * using the given context */
+	tracker_gsf_parse_xml_in_zip (uri, "content.xml", context, &error);
 
-		/* Set final normalized contents to return */
-		text = g_string_free (normalized, FALSE);
+	if (!error || g_error_matches (error, maximum_size_error_quark, 0)) {
+		content = g_string_free (info.content, FALSE);
+		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
+		tracker_sparql_builder_object_unvalidated (metadata, content);
+	} else {
+		g_warning ("Got error parsing XML file: %s\n", error->message);
+		g_string_free (info.content, TRUE);
 	}
 
-	g_free (path);
+	g_clear_error (&error);
 
-	return text;
+	g_free (content);
+	g_markup_parse_context_free (context);
 }
 
 static void
@@ -179,20 +164,25 @@
                TrackerSparqlBuilder *preupdate,
                TrackerSparqlBuilder *metadata)
 {
-	gchar *content;
 	TrackerFTSConfig *fts_config;
-	guint n_words;
-	gsize n_bytes;
-	ODTParseInfo info;
+	ODTMetadataParseInfo info;
+	ODTFileType file_type;
+	GFile *file = NULL;
+	GFileInfo *file_info = NULL;
+	const gchar *mime_used;
 	GMarkupParseContext *context;
 	GMarkupParser parser = {
-		xml_start_element_handler,
-		xml_end_element_handler,
-		xml_text_handler,
+		xml_start_element_handler_metadata,
+		xml_end_element_handler_metadata,
+		xml_text_handler_metadata,
 		NULL,
 		NULL
 	};
 
+	if (G_UNLIKELY (maximum_size_error_quark == 0)) {
+		maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error");
+	}
+
 	/* Setup conf */
 	fts_config = tracker_main_get_fts_config ();
 
@@ -214,37 +204,62 @@
 
 	/* Load the internal XML file from the Zip archive, and parse it
 	 * using the given context */
-	tracker_gsf_parse_xml_in_zip (uri, "meta.xml", context);
+	tracker_gsf_parse_xml_in_zip (uri, "meta.xml", context, NULL);
 	g_markup_parse_context_free (context);
 
 	/* Next, parse contents */
+	file = g_file_new_for_uri (uri);
 
-	/* Set max words to read from content */
-	n_words = tracker_fts_config_get_max_words_to_index (fts_config);
+	if (!file) {
+		g_warning ("Could not create GFile for URI:'%s'",
+		           uri);
+		return;
+	}
 
-	/* Set max bytes to read from content.
-	 * Assuming 3 bytes per unicode point in UTF-8, as 4-byte UTF-8 unicode
-	 *  points are really pretty rare */
-	n_bytes = 3 * n_words * tracker_fts_config_get_max_word_length(fts_config);
+	file_info = g_file_query_info (file,
+	                               G_FILE_ATTRIBUTE_STANDARD_CONTENT_TYPE,
+	                               G_FILE_QUERY_INFO_NONE,
+	                               NULL,
+	                               NULL);
+	g_object_unref (file);
 
-	/* Extract content with the given limitations */
-	content = extract_oasis_content (uri, n_words, n_bytes);
-	if (content) {
-		tracker_sparql_builder_predicate (metadata, "nie:plainTextContent");
-		tracker_sparql_builder_object_unvalidated (metadata, content);
-		g_free (content);
+	if (!file_info) {
+		g_warning ("Could not get GFileInfo for URI:'%s'",
+		           uri);
+		return;
+	}
+
+	mime_used = g_file_info_get_content_type (file_info);
+
+	if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.text") == 0) {
+		file_type = FILE_TYPE_ODT;
+	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.presentation") == 0) {
+		file_type = FILE_TYPE_ODP;
+	} else if (g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.spreadsheet") == 0) {
+		file_type = FILE_TYPE_ODS;
+	} else {
+		g_message ("Mime type was not recognised:'%s'", mime_used);
+		file_type = FILE_TYPE_INVALID;
 	}
+
+	g_object_unref (file_info);
+
+	/* Extract content with the given limitations */
+	extract_oasis_content (uri, 
+	                       3 * tracker_fts_config_get_max_words_to_index (fts_config) * tracker_fts_config_get_max_word_length (fts_config), 
+	                       file_type, 
+	                       metadata);
 }
 
 static void
-xml_start_element_handler (GMarkupParseContext  *context,
-                           const gchar          *element_name,
-                           const gchar         **attribute_names,
-                           const gchar         **attribute_values,
-                           gpointer              user_data,
-                           GError              **error)
+xml_start_element_handler_metadata (GMarkupParseContext  *context,
+                                    const gchar          *element_name,
+                                    const gchar         **attribute_names,
+                                    const gchar         **attribute_values,
+                                    gpointer              user_data,
+                                    GError              **error)
 {
-	ODTParseInfo *data = user_data;
+	ODTMetadataParseInfo *data = user_data;
 
 	if (g_ascii_strcasecmp (element_name, "dc:title") == 0) {
 		data->current = ODT_TAG_TYPE_TITLE;
@@ -285,22 +300,22 @@
 }
 
 static void
-xml_end_element_handler (GMarkupParseContext  *context,
-                         const gchar          *element_name,
-                         gpointer              user_data,
-                         GError              **error)
+xml_end_element_handler_metadata (GMarkupParseContext  *context,
+                                  const gchar          *element_name,
+                                  gpointer              user_data,
+                                  GError              **error)
 {
-	((ODTParseInfo*) user_data)->current = -1;
+	((ODTMetadataParseInfo*) user_data)->current = -1;
 }
 
 static void
-xml_text_handler (GMarkupParseContext  *context,
-                  const gchar          *text,
-                  gsize                 text_len,
-                  gpointer              user_data,
-                  GError              **error)
+xml_text_handler_metadata (GMarkupParseContext  *context,
+                           const gchar          *text,
+                           gsize                 text_len,
+                           gpointer              user_data,
+                           GError              **error)
 {
-	ODTParseInfo *data;
+	ODTMetadataParseInfo *data;
 	TrackerSparqlBuilder *metadata;
 	const gchar *uri;
 	gchar *date;
@@ -378,6 +393,150 @@
 		break;
 	}
 }
+
+static void
+xml_start_element_handler_content (GMarkupParseContext  *context,
+                                   const gchar          *element_name,
+                                   const gchar         **attribute_names,
+                                   const gchar         **attribute_values,
+                                   gpointer              user_data,
+                                   GError              **error)
+{
+	ODTContentParseInfo *data = user_data;
+	const gchar **a; /* walks attribute_names */
+	const gchar **v; /* walks attribute_values, in step with a */
+	/* Decide from the element name whether its text is to be collected */
+	switch (data->file_type) {
+	case FILE_TYPE_ODT:
+		if ((g_ascii_strcasecmp (element_name, "text:table-of-content") == 0) ||
+		    (g_ascii_strcasecmp (element_name, "text:table-index") == 0) ||
+		    (g_ascii_strcasecmp (element_name, "text:illustration-index") == 0) ||
+		    (g_ascii_strcasecmp (element_name, "text:section") == 0)) {
+		    data->styles_present = TRUE;
+		} else if (g_ascii_strcasecmp (element_name, "table:table-cell") == 0) {
+			data->current = ODT_TAG_TYPE_WORD_TEXT;
+		} else if (g_ascii_strcasecmp (element_name, "text:p") == 0) {
+			if (data->styles_present) {
+				data->current = ODT_TAG_TYPE_WORD_TEXT;
+				break;
+			}
+			/* Otherwise only paragraphs with one of these style names match */
+			for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
+				if (g_ascii_strcasecmp (*a, "text:style-name") != 0) {
+					continue;
+				}
+
+				if ((g_ascii_strcasecmp (*v, "title-article") == 0) ||
+				    (g_ascii_strcasecmp (*v, "para-padding") == 0) ||
+				    (g_ascii_strcasecmp (*v, "para-screen") == 0)) {
+					data->current = ODT_TAG_TYPE_WORD_TEXT;
+				}
+			}
+		} else if (g_ascii_strcasecmp (element_name, "text:h") == 0) {
+			for (a = attribute_names, v = attribute_values; *a; ++a, ++v) {
+				if (g_ascii_strcasecmp (*a, "text:style-name") != 0) {
+					continue;
+				}
+				/* Any style name starting with "Heading" matches */
+				if (g_ascii_strncasecmp (*v, "Heading", 7) == 0) {
+					data->current = ODT_TAG_TYPE_WORD_TEXT;
+				}
+			}
+		} else if (g_ascii_strcasecmp (element_name, "text:span") == 0) {
+			data->current = ODT_TAG_TYPE_WORD_TEXT;
+		} else if ((g_ascii_strcasecmp (element_name, "text:a") == 0) ||
+			   (g_ascii_strcasecmp (element_name, "text:s") == 0)) {
+			data->current = ODT_TAG_TYPE_WORD_TEXT;
+		} else {
+			data->current = -1;
+		}
+		break;
+	/* Presentations: every element is marked as slide text */
+	case FILE_TYPE_ODP:
+		data->current = ODT_TAG_TYPE_SLIDE_TEXT;
+		break;
+	/* Spreadsheets: only elements whose name starts with "text" */
+	case FILE_TYPE_ODS:
+		if (g_ascii_strncasecmp (element_name, "text", 4) == 0) {
+			data->current = ODT_TAG_TYPE_SPREADSHEET_TEXT;
+		} else {
+			data->current = -1;
+		}
+		break;
+	/* Unknown document type: only log, data->current is left as it was */
+	case FILE_TYPE_INVALID:
+		g_message ("Open Office Document type: %d invalid", data->file_type);
+		break;
+	}
+}
+
+static void
+xml_end_element_handler_content (GMarkupParseContext  *context,
+                                 const gchar          *element_name,
+                                 gpointer              user_data,
+                                 GError              **error)
+{
+	ODTContentParseInfo *info = user_data;
+	gboolean keeps_current;
+
+	/* Leaving an index or section element of a text document clears the
+	 * styles_present flag */
+	if (info->file_type == FILE_TYPE_ODT &&
+	    ((g_ascii_strcasecmp (element_name, "text:table-of-content") == 0) ||
+	     (g_ascii_strcasecmp (element_name, "text:table-index") == 0) ||
+	     (g_ascii_strcasecmp (element_name, "text:illustration-index") == 0) ||
+	     (g_ascii_strcasecmp (element_name, "text:section") == 0))) {
+		info->styles_present = FALSE;
+	}
+
+	/* Only text:a and text:s leave the current tag type untouched */
+	keeps_current = (g_ascii_strcasecmp (element_name, "text:a") == 0 ||
+	                 g_ascii_strcasecmp (element_name, "text:s") == 0);
+	if (!keeps_current) {
+		info->current = -1;
+	}
+}
+
+static void
+xml_text_handler_content (GMarkupParseContext  *context,
+                          const gchar          *text,
+                          gsize                 text_len,
+                          gpointer              user_data,
+                          GError              **error)
+{
+	ODTContentParseInfo *info = user_data;
+	gsize n_valid = 0;
+	gboolean wanted;
+
+	/* Text is only collected while inside one of the content tag types */
+	wanted = (info->current == ODT_TAG_TYPE_WORD_TEXT ||
+	          info->current == ODT_TAG_TYPE_SLIDE_TEXT ||
+	          info->current == ODT_TAG_TYPE_SPREADSHEET_TEXT);
+	if (!wanted) {
+		return;
+	}
+
+	/* Byte budget used up: report it through the parser error */
+	if (info->bytes_pending == 0) {
+		g_set_error_literal (error,
+		                     maximum_size_error_quark, 0,
+		                     "Maximum text limit reached");
+		return;
+	}
+
+	/* Append the leading valid UTF-8 bytes, never exceeding the budget */
+	if (tracker_text_validate_utf8 (text,
+	                                MIN (text_len, info->bytes_pending),
+	                                &info->content,
+	                                &n_valid) &&
+	    info->content->str[info->content->len - 1] != ' ') {
+		/* Something was appended: add a whitespace as separator */
+		g_string_append_c (info->content, ' ');
+	}
+
+	/* Charge the appended bytes against the budget */
+	info->bytes_pending -= n_valid;
+}
 
 TrackerExtractData *
 tracker_extract_get_data (void)
diff -Nur a/src/tracker-extract/tracker-gsf.c b/src/tracker-extract/tracker-gsf.c
--- a/src/tracker-extract/tracker-gsf.c	2010-09-02 14:51:00.000000000 +0200
+++ b/src/tracker-extract/tracker-gsf.c	2011-01-04 18:25:15.709857887 +0100
@@ -77,9 +77,10 @@
  *  maximum size of the uncompressed XML file is limited to be to 20MBytes.
  */
 void
-tracker_gsf_parse_xml_in_zip (const gchar         *zip_file_uri,
-                              const gchar         *xml_filename,
-                              GMarkupParseContext *context)
+tracker_gsf_parse_xml_in_zip (const gchar          *zip_file_uri,
+                              const gchar          *xml_filename,
+                              GMarkupParseContext  *context,
+                              GError              **err)
 {
 	gchar *filename;
 	GError *error = NULL;
@@ -124,7 +125,8 @@
 		chunk_size = MIN (remaining_size, XML_BUFFER_SIZE);
 
 		accum = 0;
-		while (accum  <= XML_MAX_BYTES_READ &&
+		while (!error &&
+		       accum  <= XML_MAX_BYTES_READ &&
 		       chunk_size > 0 &&
 		       gsf_input_read (GSF_INPUT (member), chunk_size, buf) != NULL) {
 
@@ -132,7 +134,7 @@
 			accum += chunk_size;
 
 			/* Pass the read stream to the context parser... */
-			g_markup_parse_context_parse (context, buf, chunk_size, NULL);
+			g_markup_parse_context_parse (context, buf, chunk_size, &error);
 
 			/* update bytes to be read */
 			remaining_size -= chunk_size;
@@ -141,8 +143,9 @@
 	}
 
 	g_free (filename);
+
 	if (error)
-		g_error_free (error);
+		g_propagate_error (err, error);
 	if (infile)
 		g_object_unref (infile);
 	if (src)
diff -Nur a/src/tracker-extract/tracker-gsf.h b/src/tracker-extract/tracker-gsf.h
--- a/src/tracker-extract/tracker-gsf.h	2010-09-02 14:51:00.000000000 +0200
+++ b/src/tracker-extract/tracker-gsf.h	2011-01-04 18:25:15.713192004 +0100
@@ -25,9 +25,10 @@
 
 G_BEGIN_DECLS
 
-void tracker_gsf_parse_xml_in_zip (const gchar         *zip_file_uri,
-                                   const gchar         *xml_filename,
-                                   GMarkupParseContext *context);
+void tracker_gsf_parse_xml_in_zip (const gchar          *zip_file_uri,
+                                   const gchar          *xml_filename,
+                                   GMarkupParseContext  *context,
+                                   GError              **error);
 
 G_END_DECLS
 
diff -Nur a/src/tracker-extract/tracker-read.c b/src/tracker-extract/tracker-read.c
--- a/src/tracker-extract/tracker-read.c	1970-01-01 01:00:00.000000000 +0100
+++ b/src/tracker-extract/tracker-read.c	2011-01-04 18:25:15.713192004 +0100
@@ -0,0 +1,279 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan.frade@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#include <string.h>
+#include <unistd.h>
+
+#include <glib.h>
+#include <gio/gio.h>
+
+#include <libtracker-extract/tracker-extract.h>
+
+#include "tracker-read.h"
+
+/* Size of the buffer to use when reading, in bytes */
+#define BUFFER_SIZE 65535
+
+static GString *
+get_string_in_locale (GString *s)
+{
+	GError *conv_error = NULL;
+	gsize n_read;
+	gsize n_written;
+	gchar *utf8;
+
+	/* Try to convert the whole buffer from the locale encoding */
+	utf8 = g_locale_to_utf8 (s->str, s->len, &n_read, &n_written, &conv_error);
+
+	if (!conv_error) {
+		/* Conversion worked, replace the contents of the string */
+		g_string_assign (s, utf8);
+	} else {
+		g_debug ("  Conversion to UTF-8 read %" G_GSIZE_FORMAT " bytes, wrote %" G_GSIZE_FORMAT " bytes",
+		         n_read,
+		         n_written);
+		g_message ("Could not convert string from locale to UTF-8, %s",
+		           conv_error->message);
+		g_error_free (conv_error);
+	}
+
+	/* The converted copy is not needed any more in either case */
+	g_free (utf8);
+
+	return s;
+}
+
+
+/* Appends the @read_size bytes in @read_bytes to @s (creating it if %NULL)
+ * and subtracts them from @remaining_size. @buffer_size is the size of a
+ * full read, used to detect a first chunk with no line breaks.
+ *
+ * Returns %TRUE if read operation should continue, %FALSE otherwise */
+static gboolean
+process_chunk (const gchar  *read_bytes,
+               gsize         read_size,
+               gsize         buffer_size,
+               gsize        *remaining_size,
+               GString     **s)
+{
+	/* If no more bytes to read, halt loop */
+	if (read_size == 0) {
+		return FALSE;
+	}
+
+	/* On the very first chunk, decide whether the stream is worth
+	 * indexing at all: a full buffer without a single '\n' is
+	 * rejected, and so is a stream with fewer than 3 bytes. A short
+	 * first read (e.g. 10 bytes on one line) is not subject to the
+	 * '\n' check.
+	 */
+	if (*s == NULL) {
+		if (read_size == buffer_size &&
+		    g_strstr_len (read_bytes, read_size, "\n") == NULL) {
+			g_debug ("  No '\\n' in the first %" G_GSIZE_FORMAT " bytes, "
+			         "not indexing file",
+			         read_size);
+			return FALSE;
+		} else if (read_size <= 2) {
+			g_debug ("  File has less than 3 characters in it, "
+			         "not indexing file");
+			return FALSE;
+		}
+	}
+
+	/* Update remaining bytes */
+	*remaining_size -= read_size;
+
+	g_debug ("  Read "
+	         "%" G_GSIZE_FORMAT " bytes from file, %" G_GSIZE_FORMAT " "
+	         "bytes remaining until configured threshold is reached",
+	         read_size,
+	         *remaining_size);
+
+	/* Append the bytes read, which are not NUL-terminated */
+	*s = (*s ?
+	      g_string_append_len (*s, read_bytes, read_size) :
+	      g_string_new_len (read_bytes, read_size));
+
+	return TRUE;
+}
+
+/* Takes ownership of @s and returns its contents as a newly-allocated
+ * string, or %NULL if it ends up empty. */
+static gchar *
+process_whole_string (GString  *s,
+                      gboolean  try_locale_if_not_utf8)
+{
+	gsize n_valid_utf8_bytes = 0;
+
+	/* Get number of valid UTF-8 bytes found */
+	tracker_text_validate_utf8 (s->str, s->len,
+	                            NULL, &n_valid_utf8_bytes);
+
+	/* A valid UTF-8 file will be that where all read bytes are valid,
+	 *  with a margin of 3 bytes for the last UTF-8 character which might
+	 *  have been cut. */
+	if (try_locale_if_not_utf8 &&
+	    s->len - n_valid_utf8_bytes > 3) {
+		/* If not UTF-8, try to get contents in locale encoding
+		 *  (@s is left unchanged if that conversion fails) */
+		s = get_string_in_locale (s);
+	} else if (n_valid_utf8_bytes < s->len) {
+		g_debug ("  Truncating to last valid UTF-8 character "
+		         "(%" G_GSIZE_FORMAT "/%" G_GSIZE_FORMAT " bytes)",
+		         n_valid_utf8_bytes,
+		         s->len);
+		s = g_string_truncate (s, n_valid_utf8_bytes);
+	}
+
+	if (s->len < 1) {
+		g_string_free (s, TRUE);
+		return NULL;
+	}
+
+	return g_string_free (s, FALSE);
+}
+
+/**
+ * tracker_read_text_from_stream:
+ * @stream: the #GInputStream the text is read from
+ * @max_bytes: upper limit of bytes to take from @stream, must be > 0
+ * @try_locale_if_not_utf8: whether text that is not valid UTF-8 should be
+ *   converted from the locale encoding to UTF-8
+ *
+ * Reads at most @max_bytes from @stream in chunks of BUFFER_SIZE bytes and
+ *  validates the collected text as UTF-8. Reading ends early at end of
+ *  stream, on a read error, or when the chunk processing rejects the data.
+ *
+ * Returns: newly-allocated NUL-terminated UTF-8 string with the read text,
+ *  or %NULL if no usable text was read.
+ **/
+gchar *
+tracker_read_text_from_stream (GInputStream *stream,
+                               gsize       max_bytes,
+                               gboolean    try_locale_if_not_utf8)
+{
+	GString *text = NULL;
+	gsize bytes_left;
+
+	g_return_val_if_fail (stream, NULL);
+	g_return_val_if_fail (max_bytes > 0, NULL);
+
+	/* Pull the stream in BUFFER_SIZE pieces until the byte budget is
+	 *  used up. The loop also stops when:
+	 *    - the read fails
+	 *    - process_chunk() asks to halt (nothing more to read, stream with
+	 *      less than 3 bytes, or a full first chunk without any EOL)
+	 */
+	for (bytes_left = max_bytes; bytes_left > 0; ) {
+		gchar chunk[BUFFER_SIZE];
+		GError *read_error = NULL;
+		gsize chunk_len;
+		gsize wanted = MIN (BUFFER_SIZE, bytes_left);
+		gboolean ok;
+
+		ok = g_input_stream_read_all (stream, chunk, wanted,
+		                              &chunk_len, NULL, &read_error);
+		if (!ok) {
+			g_message ("Error reading from stream: '%s'",
+			           read_error->message);
+			g_error_free (read_error);
+			break;
+		}
+
+		/* Hand the chunk over; FALSE means reading must stop */
+		ok = process_chunk (chunk, chunk_len, BUFFER_SIZE,
+		                    &bytes_left, &text);
+		if (!ok) {
+			break;
+		}
+	}
+
+	/* Nothing collected at all: nothing to validate */
+	if (text == NULL) {
+		return NULL;
+	}
+
+	return process_whole_string (text, try_locale_if_not_utf8);
+}
+
+
+/**
+ * tracker_read_text_from_fd:
+ * @fd: input fd to read from
+ * @max_bytes: max number of bytes to read from @fd, must be greater than 0
+ * @try_locale_if_not_utf8: if the text read is not valid UTF-8, try to
+ *   convert from locale-encoding to UTF-8
+ *
+ * Reads up to @max_bytes from @fd, and validates the read text as proper
+ *  UTF-8. Takes ownership of @fd: it is closed on every return path,
+ *  including when @max_bytes is 0 or the FD cannot be opened for reading.
+ *
+ * Returns: newly-allocated NUL-terminated UTF-8 string with the read text,
+ *  or %NULL if nothing usable could be read. Free with g_free().
+ **/
+gchar *
+tracker_read_text_from_fd (gint     fd,
+                           gsize    max_bytes,
+                           gboolean try_locale_if_not_utf8)
+{
+	FILE *fz;
+	GString *s = NULL;
+	gsize n_bytes_remaining = max_bytes;
+
+	/* Not g_return_val_if_fail(): the FD must be closed even on misuse */
+	if (max_bytes == 0) {
+		g_critical ("%s: assertion 'max_bytes > 0' failed", G_STRFUNC);
+		close (fd);
+		return NULL;
+	}
+
+	if ((fz = fdopen (fd, "r")) == NULL) {
+		g_warning ("Cannot read from FD... could not extract text");
+		close (fd);
+		return NULL;
+	}
+
+	/* Read in chunks of BUFFER_SIZE until max_bytes were read, or until
+	 *  process_chunk() halts the loop (no more bytes, stream with less
+	 *  than 3 bytes, or a first full chunk with no EOL) */
+	while (n_bytes_remaining > 0) {
+		gchar buf[BUFFER_SIZE];
+		gsize n_bytes_read;
+
+		n_bytes_read = fread (buf, 1,
+		                      MIN (BUFFER_SIZE, n_bytes_remaining), fz);
+
+		if (!process_chunk (buf, n_bytes_read, BUFFER_SIZE,
+		                    &n_bytes_remaining, &s)) {
+			break;
+		}
+	}
+
+	/* fread() reports errors and EOF alike as a short read; tell them apart */
+	if (ferror (fz)) {
+		g_message ("Error reading from FD");
+	}
+
+	/* Closing the stream also closes the underlying FD */
+	fclose (fz);
+
+	/* Validate UTF-8 if something was read, and return it */
+	return s ? process_whole_string (s, try_locale_if_not_utf8) : NULL;
+}
diff -Nur a/src/tracker-extract/tracker-read.h b/src/tracker-extract/tracker-read.h
--- a/src/tracker-extract/tracker-read.h	1970-01-01 01:00:00.000000000 +0100
+++ b/src/tracker-extract/tracker-read.h	2011-01-04 18:25:15.716526121 +0100
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2010, Nokia <ivan.frade@nokia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __TRACKER_READ_H__
+#define __TRACKER_READ_H__
+
+#include <glib.h>
+#include <gio/gio.h>
+
+G_BEGIN_DECLS
+/* Reads up to max_bytes of text from stream; returns validated text or NULL */
+gchar *tracker_read_text_from_stream (GInputStream *stream,
+                                      gsize         max_bytes,
+                                      gboolean      try_locale_if_not_utf8);
+/* Same, but reading from fd, which gets closed once reading has finished */
+gchar *tracker_read_text_from_fd (gint     fd,
+                                  gsize    max_bytes,
+                                  gboolean try_locale_if_not_utf8);
+
+G_END_DECLS
+
+#endif /* __TRACKER_READ_H__ */
+