diff -Nur a/src/libtracker-extract/tracker-utils.c b/src/libtracker-extract/tracker-utils.c --- a/src/libtracker-extract/tracker-utils.c 2010-09-02 14:51:00.000000000 +0200 +++ b/src/libtracker-extract/tracker-utils.c 2011-01-04 18:27:05.301708634 +0100 @@ -233,6 +233,66 @@ } /** + * tracker_text_validate_utf8: + * @text: the text to validate + * @text_len: length of @text, or -1 if NUL-terminated + * @str: the string where to place the validated UTF-8 characters, or %NULL if + * not needed. + * @valid_len: Output number of valid UTF-8 bytes found (0 if none), or %NULL if not needed + * + * This function iterates through @text checking for UTF-8 validity + * using g_utf8_validate(), appends the first chunk of valid characters + * to @str, and gives the number of valid UTF-8 bytes in @valid_len. + * + * Returns: %TRUE if some bytes were found to be valid, %FALSE otherwise. + * + * Since: 0.9 + **/ +gboolean +tracker_text_validate_utf8 (const gchar *text, + gssize text_len, + GString **str, + gsize *valid_len) +{ + gsize len_to_validate; + + g_return_val_if_fail (text, FALSE); + + /* Give @valid_len a defined value even when no byte validates */ + if (valid_len) { + *valid_len = 0; + } + + len_to_validate = text_len >= 0 ? text_len : strlen (text); + + if (len_to_validate > 0) { + const gchar *end = text; + + /* Validate string, getting the pointer to first non-valid character + * (if any) or to the end of the string. */ + g_utf8_validate (text, len_to_validate, &end); + if (end > text) { + /* If str output required... */ + if (str) { + /* Create string to output if not already as input */ + *str = (*str == NULL ? + g_string_new_len (text, end - text) : + g_string_append_len (*str, text, end - text)); + } + + /* If utf8 len output required...
*/ + if (valid_len) { + *valid_len = end - text; + } + + return TRUE; + } + } + + return FALSE; +} + +/** * tracker_date_format_to_iso8601: * @date_string: the date in a string pointer * @format: the format of the @date_string diff -Nur a/src/libtracker-extract/tracker-utils.h b/src/libtracker-extract/tracker-utils.h --- a/src/libtracker-extract/tracker-utils.h 2010-09-02 14:51:00.000000000 +0200 +++ b/src/libtracker-extract/tracker-utils.h 2011-01-04 18:28:26.122960742 +0100 @@ -34,6 +34,10 @@ gchar *tracker_text_normalize (const gchar *text, guint max_words, guint *n_words); +gboolean tracker_text_validate_utf8 (const gchar *text, + gssize text_len, + GString **str, + gsize *valid_len); gchar *tracker_date_guess (const gchar *date_string); gchar *tracker_date_format_to_iso8601 (const gchar *date_string, const gchar *format); diff -Nur a/src/tracker-extract/tracker-extract-msoffice.c b/src/tracker-extract/tracker-extract-msoffice.c --- a/src/tracker-extract/tracker-extract-msoffice.c 2010-09-02 16:22:50.000000000 +0200 +++ b/src/tracker-extract/tracker-extract-msoffice.c 2011-01-04 18:25:15.706523770 +0100 @@ -2177,7 +2177,7 @@ * using the given context */ tracker_gsf_parse_xml_in_zip (parser_info->uri, xml_filename, - context); + context, NULL); g_markup_parse_context_free (context); } @@ -2335,7 +2335,7 @@ * using the given context */ tracker_gsf_parse_xml_in_zip (uri, "[Content_Types].xml", - context); + context, NULL); if (info.content) { gchar *content; diff -Nur a/src/tracker-extract/tracker-extract-oasis.c b/src/tracker-extract/tracker-extract-oasis.c --- a/src/tracker-extract/tracker-extract-oasis.c 2010-09-02 14:51:00.000000000 +0200 +++ b/src/tracker-extract/tracker-extract-oasis.c 2011-01-04 19:19:50.938606364 +0100 @@ -24,6 +24,7 @@ #include "tracker-main.h" #include "tracker-gsf.h" +#include "tracker-read.h" #include @@ -36,142 +37,126 @@ ODT_TAG_TYPE_COMMENTS, ODT_TAG_TYPE_STATS, ODT_TAG_TYPE_CREATED, - ODT_TAG_TYPE_GENERATOR +
ODT_TAG_TYPE_GENERATOR, + ODT_TAG_TYPE_WORD_TEXT, + ODT_TAG_TYPE_SLIDE_TEXT, + ODT_TAG_TYPE_SPREADSHEET_TEXT } ODTTagType; +typedef enum { + FILE_TYPE_INVALID, + FILE_TYPE_ODP, + FILE_TYPE_ODT, + FILE_TYPE_ODS +} ODTFileType; + typedef struct { TrackerSparqlBuilder *metadata; ODTTagType current; const gchar *uri; gboolean title_already_set; -} ODTParseInfo; +} ODTMetadataParseInfo; -static void xml_start_element_handler (GMarkupParseContext *context, - const gchar *element_name, - const gchar **attribute_names, - const gchar **attribute_values, - gpointer user_data, - GError **error); -static void xml_end_element_handler (GMarkupParseContext *context, - const gchar *element_name, - gpointer user_data, - GError **error); -static void xml_text_handler (GMarkupParseContext *context, - const gchar *text, - gsize text_len, - gpointer user_data, - GError **error); -static void extract_oasis (const gchar *filename, - TrackerSparqlBuilder *preupdate, - TrackerSparqlBuilder *metadata); +typedef struct { + ODTTagType current; + gboolean styles_present; + ODTFileType file_type; + GString *content; + gulong bytes_pending; +} ODTContentParseInfo; + +static GQuark maximum_size_error_quark = 0; /* file-local error domain set by the content text handler when the byte budget is spent */ + +static void xml_start_element_handler_metadata (GMarkupParseContext *context, + const gchar *element_name, + const gchar **attribute_names, + const gchar **attribute_values, + gpointer user_data, + GError **error); +static void xml_end_element_handler_metadata (GMarkupParseContext *context, + const gchar *element_name, + gpointer user_data, + GError **error); +static void xml_text_handler_metadata (GMarkupParseContext *context, + const gchar *text, + gsize text_len, + gpointer user_data, + GError **error); +static void xml_start_element_handler_content (GMarkupParseContext *context, + const gchar *element_name, + const gchar **attribute_names, + const gchar **attribute_values, + gpointer user_data, + GError **error); +static void xml_end_element_handler_content (GMarkupParseContext *context, +
const gchar *element_name, + gpointer user_data, + GError **error); +static void xml_text_handler_content (GMarkupParseContext *context, + const gchar *text, + gsize text_len, + gpointer user_data, + GError **error); +static void extract_oasis (const gchar *filename, + TrackerSparqlBuilder *preupdate, + TrackerSparqlBuilder *metadata); +static void extract_oasis_content (const gchar *uri, + gulong total_bytes, + ODTFileType file_type, + TrackerSparqlBuilder *metadata); static TrackerExtractData extract_data[] = { { "application/vnd.oasis.opendocument.*", extract_oasis }, { NULL, NULL } }; - -#define ODT_BUFFER_SIZE 8193 /* bytes */ - -static gchar * -extract_oasis_content (const gchar *uri, - guint n_words, - gsize n_bytes) +static void +extract_oasis_content (const gchar *uri, + gulong total_bytes, + ODTFileType file_type, + TrackerSparqlBuilder *metadata) { - const gchar *argv[4]; - gint fdz; - FILE *fz; + gchar *content = NULL; /* stays NULL on the parse-error path so the final g_free() is a no-op */ + ODTContentParseInfo info; + GMarkupParseContext *context; GError *error = NULL; - gchar *text = NULL; - gchar *path; + GMarkupParser parser = { + xml_start_element_handler_content, + xml_end_element_handler_content, + xml_text_handler_content, + NULL, + NULL + }; - /* Newly allocated string with the file path */ - path = g_filename_from_uri (uri, NULL, NULL); + /* Create parse info */ + info.current = ODT_TAG_TYPE_UNKNOWN; + info.file_type = file_type; + info.styles_present = FALSE; + info.content = g_string_new (""); + info.bytes_pending = total_bytes; - /* Setup command to be executed */ - argv[0] = "odt2txt"; - argv[1] = "--encoding=utf-8"; - argv[2] = path; - argv[3] = NULL; - - g_debug ("Executing command:'%s %s %s' (max words: %u, " - "max_bytes: %" G_GSIZE_FORMAT ")", - argv[0], argv[1], argv[2], n_words, n_bytes); - - /* Fork & spawn */ - if (!g_spawn_async_with_pipes (g_get_tmp_dir (), - (gchar **)argv, - NULL, - G_SPAWN_SEARCH_PATH | G_SPAWN_STDERR_TO_DEV_NULL, - tracker_spawn_child_func, - GINT_TO_POINTER (10), - NULL, - NULL, -
&fdz, - NULL, - &error)) { - g_warning ("Spawning failed, could not extract text from '%s': %s", - path, error ? error->message : NULL); - g_clear_error (&error); - } - /* Open file descriptor for reading */ - else if ((fz = fdopen (fdz, "r")) == NULL) { - g_warning ("Cannot read child's output... could not extract " - "text from '%s'", path); - close (fdz); - } - /* Start buffered reading... */ - else { - unsigned char buf[ODT_BUFFER_SIZE]; - size_t r, accum; - guint n_words_remaining = n_words; - GString *normalized; - - accum = 0; - normalized = g_string_new (""); - - /* Reading in chunks of ODT_BUFFER_SIZE -1 (8192) - * Loop is halted whenever one of this conditions is met: - * a) Read bytes reached the maximum allowed (n_bytes) - * b) Already read up to the max number of words configured - * c) No more bytes to read - */ - while ((accum <= n_bytes) && - (n_words_remaining > 0) && - (r = fread (buf, 1, ODT_BUFFER_SIZE-1, fz))) { - gchar *normalized_chunk; - guint n_words_normalized; - - /* Always make sure that the read string will be - * NIL-terminated */ - buf[r] = '\0'; - /* Get normalized chunk */ - normalized_chunk = tracker_text_normalize (buf, - n_words_remaining, - &n_words_normalized); - /* Update number of words remaining. - * Note that n_words_normalized should always be less or - * equal than n_words_remaining */ - n_words_remaining = (n_words_normalized <= n_words_remaining ?
- n_words_remaining - n_words_normalized : 0); - /* Update accumulated */ - accum += r; - - /* Add normalized chunk to the whole normalized string */ - g_string_append (normalized, normalized_chunk); - g_free (normalized_chunk); - } + /* Create parsing context */ + context = g_markup_parse_context_new (&parser, 0, &info, NULL); - /* fclose() the stream, no need to close() the original FD */ - fclose (fz); + /* Load the internal XML file from the Zip archive, and parse it + * using the given context */ + tracker_gsf_parse_xml_in_zip (uri, "content.xml", context, &error); - /* Set final normalized contents to return */ - text = g_string_free (normalized, FALSE); + if (!error || g_error_matches (error, maximum_size_error_quark, 0)) { + content = g_string_free (info.content, FALSE); + tracker_sparql_builder_predicate (metadata, "nie:plainTextContent"); + tracker_sparql_builder_object_unvalidated (metadata, content); + } else { + g_warning ("Got error parsing XML file: %s\n", error->message); + g_string_free (info.content, TRUE); } - g_free (path); + if (error) { + g_error_free (error); + } - return text; + g_free (content); + g_markup_parse_context_free (context); } static void @@ -179,20 +164,25 @@ TrackerSparqlBuilder *preupdate, TrackerSparqlBuilder *metadata) { - gchar *content; TrackerFTSConfig *fts_config; - guint n_words; - gsize n_bytes; - ODTParseInfo info; + ODTMetadataParseInfo info; + ODTFileType file_type; + GFile *file = NULL; + GFileInfo *file_info = NULL; + const gchar *mime_used; GMarkupParseContext *context; GMarkupParser parser = { - xml_start_element_handler, - xml_end_element_handler, - xml_text_handler, + xml_start_element_handler_metadata, + xml_end_element_handler_metadata, + xml_text_handler_metadata, NULL, NULL }; + if (G_UNLIKELY (maximum_size_error_quark == 0)) { + maximum_size_error_quark = g_quark_from_static_string ("maximum_size_error"); + } + /* Setup conf */ fts_config = tracker_main_get_fts_config (); @@ -214,37 +204,62 @@ /* Load the
internal XML file from the Zip archive, and parse it * using the given context */ - tracker_gsf_parse_xml_in_zip (uri, "meta.xml", context); + tracker_gsf_parse_xml_in_zip (uri, "meta.xml", context, NULL); g_markup_parse_context_free (context); /* Next, parse contents */ + file = g_file_new_for_uri (uri); - /* Set max words to read from content */ - n_words = tracker_fts_config_get_max_words_to_index (fts_config); + if (!file) { + g_warning ("Could not create GFile for URI:'%s'", + uri); + return; + } - /* Set max bytes to read from content. - * Assuming 3 bytes per unicode point in UTF-8, as 4-byte UTF-8 unicode - * points are really pretty rare */ - n_bytes = 3 * n_words * tracker_fts_config_get_max_word_length(fts_config); + file_info = g_file_query_info (file, + G_FILE_ATTRIBUTE_STANDARD_CONTENT_TYPE, + G_FILE_QUERY_INFO_NONE, + NULL, + NULL); + g_object_unref (file); - /* Extract content with the given limitations */ - content = extract_oasis_content (uri, n_words, n_bytes); - if (content) { - tracker_sparql_builder_predicate (metadata, "nie:plainTextContent"); - tracker_sparql_builder_object_unvalidated (metadata, content); - g_free (content); + if (!file_info) { + g_warning ("Could not get GFileInfo for URI:'%s'", + uri); + return; + } + + mime_used = g_file_info_get_content_type (file_info); /* may be NULL; g_ascii_strcasecmp (NULL, ...) returns 0 and would look like a match */ + + if (mime_used && g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.text") == 0) { + file_type = FILE_TYPE_ODT; + } else if (mime_used && g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.presentation") == 0) { + file_type = FILE_TYPE_ODP; + } else if (mime_used && g_ascii_strcasecmp (mime_used, "application/vnd.oasis.opendocument.spreadsheet") == 0) { + file_type = FILE_TYPE_ODS; + } else { + g_message ("Mime type was not recognised:'%s'", mime_used ? mime_used : "(null)"); + file_type = FILE_TYPE_INVALID; } + + g_object_unref (file_info); + + /* Extract content with the given limitations */ + extract_oasis_content (uri, + 3 * tracker_fts_config_get_max_words_to_index (fts_config) *
tracker_fts_config_get_max_word_length (fts_config), + file_type, + metadata); } static void -xml_start_element_handler (GMarkupParseContext *context, - const gchar *element_name, - const gchar **attribute_names, - const gchar **attribute_values, - gpointer user_data, - GError **error) +xml_start_element_handler_metadata (GMarkupParseContext *context, + const gchar *element_name, + const gchar **attribute_names, + const gchar **attribute_values, + gpointer user_data, + GError **error) { - ODTParseInfo *data = user_data; + ODTMetadataParseInfo *data = user_data; if (g_ascii_strcasecmp (element_name, "dc:title") == 0) { data->current = ODT_TAG_TYPE_TITLE; @@ -285,22 +300,22 @@ } static void -xml_end_element_handler (GMarkupParseContext *context, - const gchar *element_name, - gpointer user_data, - GError **error) +xml_end_element_handler_metadata (GMarkupParseContext *context, + const gchar *element_name, + gpointer user_data, + GError **error) { - ((ODTParseInfo*) user_data)->current = -1; + ((ODTMetadataParseInfo*) user_data)->current = -1; } static void -xml_text_handler (GMarkupParseContext *context, - const gchar *text, - gsize text_len, - gpointer user_data, - GError **error) +xml_text_handler_metadata (GMarkupParseContext *context, + const gchar *text, + gsize text_len, + gpointer user_data, + GError **error) { - ODTParseInfo *data; + ODTMetadataParseInfo *data; TrackerSparqlBuilder *metadata; const gchar *uri; gchar *date; @@ -378,6 +393,150 @@ break; } } + +static void +xml_start_element_handler_content (GMarkupParseContext *context, + const gchar *element_name, + const gchar **attribute_names, + const gchar **attribute_values, + gpointer user_data, + GError **error) +{ + ODTContentParseInfo *data = user_data; + const gchar **a; + const gchar **v; + + switch (data->file_type) { + case FILE_TYPE_ODT: + if ((g_ascii_strcasecmp (element_name, "text:table-of-content") == 0) || + (g_ascii_strcasecmp (element_name, "text:table-index") == 0) || +
(g_ascii_strcasecmp (element_name, "text:illustration-index") == 0) || + (g_ascii_strcasecmp (element_name, "text:section") == 0)) { + data->styles_present = TRUE; + } else if (g_ascii_strcasecmp (element_name, "table:table-cell") == 0) { + data->current = ODT_TAG_TYPE_WORD_TEXT; + } else if (g_ascii_strcasecmp (element_name, "text:p") == 0) { + if (data->styles_present) { + data->current = ODT_TAG_TYPE_WORD_TEXT; + break; + } + + for (a = attribute_names, v = attribute_values; *a; ++a, ++v) { + if (g_ascii_strcasecmp (*a, "text:style-name") != 0) { + continue; + } + + if ((g_ascii_strcasecmp (*v, "title-article") == 0) || + (g_ascii_strcasecmp (*v, "para-padding") == 0) || + (g_ascii_strcasecmp (*v, "para-screen") == 0)) { + data->current = ODT_TAG_TYPE_WORD_TEXT; + } + } + } else if (g_ascii_strcasecmp (element_name, "text:h") == 0) { + for (a = attribute_names, v = attribute_values; *a; ++a, ++v) { + if (g_ascii_strcasecmp (*a, "text:style-name") != 0) { + continue; + } + + if (g_ascii_strncasecmp (*v, "Heading", 7) == 0) { + data->current = ODT_TAG_TYPE_WORD_TEXT; + } + } + } else if (g_ascii_strcasecmp (element_name, "text:span") == 0) { + data->current = ODT_TAG_TYPE_WORD_TEXT; + } else if ((g_ascii_strcasecmp (element_name, "text:a") == 0) || + (g_ascii_strcasecmp (element_name, "text:s") == 0)) { + data->current = ODT_TAG_TYPE_WORD_TEXT; + } else { + data->current = -1; + } + break; + + case FILE_TYPE_ODP: + data->current = ODT_TAG_TYPE_SLIDE_TEXT; + break; + + case FILE_TYPE_ODS: + if (g_ascii_strncasecmp (element_name, "text", 4) == 0) { + data->current = ODT_TAG_TYPE_SPREADSHEET_TEXT; + } else { + data->current = -1; + } + break; + + case FILE_TYPE_INVALID: + g_message ("Open Office Document type: %d invalid", data->file_type); + break; + } +} + +static void +xml_end_element_handler_content (GMarkupParseContext *context, + const gchar *element_name, + gpointer user_data, + GError **error) +{ + ODTContentParseInfo *data = user_data; + + switch
(data->file_type) { + case FILE_TYPE_ODT: + if ((g_ascii_strcasecmp (element_name, "text:table-of-content") == 0) || + (g_ascii_strcasecmp (element_name, "text:table-index") == 0) || + (g_ascii_strcasecmp (element_name, "text:illustration-index") == 0) || + (g_ascii_strcasecmp (element_name, "text:section") == 0)) { + data->styles_present = FALSE; + } + break; + default: + break; + } + + if ((g_ascii_strcasecmp (element_name, "text:a") != 0) && + (g_ascii_strcasecmp (element_name, "text:s") != 0)) { + data->current = -1; + } +} + +static void +xml_text_handler_content (GMarkupParseContext *context, + const gchar *text, + gsize text_len, + gpointer user_data, + GError **error) +{ + ODTContentParseInfo *data = user_data; + gsize written_bytes = 0; + + switch (data->current) { + case ODT_TAG_TYPE_WORD_TEXT: + case ODT_TAG_TYPE_SLIDE_TEXT: + case ODT_TAG_TYPE_SPREADSHEET_TEXT: + if (data->bytes_pending == 0) { + g_set_error_literal (error, + maximum_size_error_quark, 0, + "Maximum text limit reached"); + break; + } + + /* Look for valid UTF-8 text */ + if (tracker_text_validate_utf8 (text, + MIN (text_len, data->bytes_pending), + &data->content, + &written_bytes)) { + if (data->content->str[data->content->len - 1] != ' ') { + /* If some bytes found to be valid, append an extra whitespace + * as separator */ + g_string_append_c (data->content, ' '); + } + } + + data->bytes_pending = (text_len >= data->bytes_pending ? 0 : data->bytes_pending - written_bytes); /* a chunk capped by the budget spends it all: a multi-byte character split by the cap would otherwise leave 1-3 bytes pending forever and the limit error would never fire */ + break; + + default: + break; + } +} TrackerExtractData * tracker_extract_get_data (void) diff -Nur a/src/tracker-extract/tracker-gsf.c b/src/tracker-extract/tracker-gsf.c --- a/src/tracker-extract/tracker-gsf.c 2010-09-02 14:51:00.000000000 +0200 +++ b/src/tracker-extract/tracker-gsf.c 2011-01-04 18:25:15.709857887 +0100 @@ -77,9 +77,10 @@ * maximum size of the uncompressed XML file is limited to be to 20MBytes.
*/ void -tracker_gsf_parse_xml_in_zip (const gchar *zip_file_uri, - const gchar *xml_filename, - GMarkupParseContext *context) +tracker_gsf_parse_xml_in_zip (const gchar *zip_file_uri, + const gchar *xml_filename, + GMarkupParseContext *context, + GError **err) { gchar *filename; GError *error = NULL; @@ -124,7 +125,8 @@ chunk_size = MIN (remaining_size, XML_BUFFER_SIZE); accum = 0; - while (accum <= XML_MAX_BYTES_READ && + while (!error && + accum <= XML_MAX_BYTES_READ && chunk_size > 0 && gsf_input_read (GSF_INPUT (member), chunk_size, buf) != NULL) { @@ -132,7 +134,7 @@ accum += chunk_size; /* Pass the read stream to the context parser... */ - g_markup_parse_context_parse (context, buf, chunk_size, NULL); + g_markup_parse_context_parse (context, buf, chunk_size, &error); /* update bytes to be read */ remaining_size -= chunk_size; @@ -141,8 +143,9 @@ } g_free (filename); + if (error) - g_error_free (error); + g_propagate_error (err, error); if (infile) g_object_unref (infile); if (src) diff -Nur a/src/tracker-extract/tracker-gsf.h b/src/tracker-extract/tracker-gsf.h --- a/src/tracker-extract/tracker-gsf.h 2010-09-02 14:51:00.000000000 +0200 +++ b/src/tracker-extract/tracker-gsf.h 2011-01-04 18:25:15.713192004 +0100 @@ -25,9 +25,10 @@ G_BEGIN_DECLS -void tracker_gsf_parse_xml_in_zip (const gchar *zip_file_uri, - const gchar *xml_filename, - GMarkupParseContext *context); +void tracker_gsf_parse_xml_in_zip (const gchar *zip_file_uri, + const gchar *xml_filename, + GMarkupParseContext *context, + GError **error); G_END_DECLS diff -Nur a/src/tracker-extract/tracker-read.c b/src/tracker-extract/tracker-read.c --- a/src/tracker-extract/tracker-read.c 1970-01-01 01:00:00.000000000 +0100 +++ b/src/tracker-extract/tracker-read.c 2011-01-04 18:25:15.713192004 +0100 @@ -0,0 +1,279 @@ +/* + * Copyright (C) 2010, Nokia + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the
Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#include +#include + +#include +#include + +#include + +#include "tracker-read.h" + +/* Size of the buffer to use when reading, in bytes */ +#define BUFFER_SIZE 65535 + +static GString * +get_string_in_locale (GString *s) +{ + GError *error = NULL; + gchar *str; + gsize bytes_read = 0; + gsize bytes_written = 0; /* zeroed because the error branch below logs both counters and the converter may not have stored them */ + + str = g_locale_to_utf8 (s->str, + s->len, + &bytes_read, + &bytes_written, + &error); + if (error) { + g_debug (" Conversion to UTF-8 read %" G_GSIZE_FORMAT " bytes, wrote %" G_GSIZE_FORMAT " bytes", + bytes_read, + bytes_written); + g_message ("Could not convert string from locale to UTF-8, %s", + error->message); + g_error_free (error); + g_free (str); + } else { + g_string_assign (s, str); + g_free (str); + } + + return s; +} + + +/* Returns %TRUE if read operation should continue, %FALSE otherwise */ +static gboolean +process_chunk (const gchar *read_bytes, + gsize read_size, + gsize buffer_size, + gsize *remaining_size, + GString **s) +{ + /* If no more bytes to read, halt loop */ + if (read_size == 0) { + return FALSE; + } + + /* First of all, check if this is the first time we + * have tried to read the stream up to the BUFFER_SIZE + * limit. Then make sure that we read the maximum size + * of the buffer. If we don't do this, there is the + * case where we read 10 bytes in and it is just one + * line with no '\n'.
Once we have confirmed this we + * check that the buffer has a '\n' to make sure the + * file is worth indexing. Similarly if the file has + * < 3 bytes then we drop it. + */ + if (*s == NULL) { + if (read_size == buffer_size && + g_strstr_len (read_bytes, read_size, "\n") == NULL) { + g_debug (" No '\\n' in the first %" G_GSIZE_FORMAT " bytes, " + "not indexing file", + read_size); + return FALSE; + } else if (read_size <= 2) { + g_debug (" File has less than 3 characters in it, " + "not indexing file"); + return FALSE; + } + } + + /* Update remaining bytes */ + *remaining_size -= read_size; + + g_debug (" Read " + "%" G_GSIZE_FORMAT " bytes from file, %" G_GSIZE_FORMAT " " + "bytes remaining until configured threshold is reached", + read_size, + *remaining_size); + + /* Append non-NIL terminated bytes */ + *s = (*s ? + g_string_append_len (*s, read_bytes, read_size) : + g_string_new_len (read_bytes, read_size)); + + return TRUE; +} + +static gchar * +process_whole_string (GString *s, + gboolean try_locale_if_not_utf8) +{ + gsize n_valid_utf8_bytes = 0; + + /* Get number of valid UTF-8 bytes found */ + tracker_text_validate_utf8 (s->str, + s->len, + NULL, + &n_valid_utf8_bytes); + + /* A valid UTF-8 file will be that where all read bytes are valid, + * with a margin of 3 bytes for the last UTF-8 character which might + * have been cut.
*/ + if (try_locale_if_not_utf8 && + s->len - n_valid_utf8_bytes > 3) { + /* If not UTF-8, try to get contents in locale encoding + * (left unchanged if the conversion fails) */ + s = get_string_in_locale (s); + } else if (n_valid_utf8_bytes < s->len) { + g_debug (" Truncating to last valid UTF-8 character " + "(%" G_GSIZE_FORMAT "/%" G_GSIZE_FORMAT " bytes)", + n_valid_utf8_bytes, + s->len); + s = g_string_truncate (s, n_valid_utf8_bytes); + } + + if (s->len < 1) { + g_string_free (s, TRUE); + return NULL; + } + + return g_string_free (s, FALSE); +} + +/** + * tracker_read_text_from_stream: + * @stream: input stream to read from + * @max_bytes: max number of bytes to read from @stream + * @try_locale_if_not_utf8: if the text read is not valid UTF-8, try to + * convert from locale-encoding to UTF-8 + * + * Reads up to @max_bytes from @stream, and validates the read text as proper + * UTF-8. + * + * Returns: newly-allocated NUL-terminated UTF-8 string with the read text, or %NULL if nothing usable was read. + **/ +gchar * +tracker_read_text_from_stream (GInputStream *stream, + gsize max_bytes, + gboolean try_locale_if_not_utf8) +{ + GString *s = NULL; + gsize n_bytes_remaining = max_bytes; + + g_return_val_if_fail (stream, NULL); + g_return_val_if_fail (max_bytes > 0, NULL); + + /* Reading in chunks of BUFFER_SIZE + * Loop is halted whenever one of these conditions is met: + * a) Read bytes reached the maximum allowed (max_bytes) + * b) No more bytes to read + * c) Error reading + * d) Stream has less than 3 bytes + * e) Stream has a single line of BUFFER_SIZE bytes with no EOL + */ + while (n_bytes_remaining > 0) { + gchar buf[BUFFER_SIZE]; + GError *error = NULL; + gsize n_bytes_read; + + /* Read bytes from stream */ + if (!g_input_stream_read_all (stream, + buf, + MIN (BUFFER_SIZE, n_bytes_remaining), + &n_bytes_read, + NULL, + &error)) { + g_message ("Error reading from stream: '%s'", + error->message); + g_error_free (error); + break; + } + + /* Process read bytes, and halt loop if needed */ + if
(!process_chunk (buf, + n_bytes_read, + BUFFER_SIZE, + &n_bytes_remaining, + &s)) { + break; + } + } + + /* Validate UTF-8 if something was read, and return it */ + return s ? process_whole_string (s, try_locale_if_not_utf8) : NULL; +} + + +/** + * tracker_read_text_from_fd: + * @fd: input fd to read from + * @max_bytes: max number of bytes to read from @fd + * @try_locale_if_not_utf8: if the text read is not valid UTF-8, try to + * convert from locale-encoding to UTF-8 + * + * Reads up to @max_bytes from @fd, and validates the read text as proper + * UTF-8. Will also properly close the FD when finished. + * + * Returns: newly-allocated NUL-terminated UTF-8 string with the read text, or %NULL if nothing usable was read. + **/ +gchar * +tracker_read_text_from_fd (gint fd, + gsize max_bytes, + gboolean try_locale_if_not_utf8) +{ + FILE *fz; + GString *s = NULL; + gsize n_bytes_remaining = max_bytes; + + g_return_val_if_fail (max_bytes > 0, NULL); + + if ((fz = fdopen (fd, "r")) == NULL) { + g_warning ("Cannot read from FD... could not extract text"); + close (fd); + return NULL; + } + + /* Reading in chunks of BUFFER_SIZE + * Loop is halted whenever one of these conditions is met: + * a) Read bytes reached the maximum allowed (max_bytes) + * b) No more bytes to read + * c) Error reading + * d) Stream has less than 3 bytes + * e) Stream has a single line of BUFFER_SIZE bytes with no EOL + */ + while (n_bytes_remaining > 0) { + gchar buf[BUFFER_SIZE]; + gsize n_bytes_read; + + /* Read bytes */ + n_bytes_read = fread (buf, + 1, + MIN (BUFFER_SIZE, n_bytes_remaining), + fz); + + /* Process read bytes, and halt loop if needed */ + if (!process_chunk (buf, + n_bytes_read, + BUFFER_SIZE, + &n_bytes_remaining, + &s)) { + break; + } + } + + /* Close the file here */ + fclose (fz); + + /* Validate UTF-8 if something was read, and return it */ + return s ?
process_whole_string (s, try_locale_if_not_utf8) : NULL; +} diff -Nur a/src/tracker-extract/tracker-read.h b/src/tracker-extract/tracker-read.h --- a/src/tracker-extract/tracker-read.h 1970-01-01 01:00:00.000000000 +0100 +++ b/src/tracker-extract/tracker-read.h 2011-01-04 18:25:15.716526121 +0100 @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2010, Nokia + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#ifndef __TRACKER_READ_H__ +#define __TRACKER_READ_H__ + +#include +#include + +G_BEGIN_DECLS + +gchar *tracker_read_text_from_stream (GInputStream *stream, + gsize max_bytes, + gboolean try_locale_if_not_utf8); + +gchar *tracker_read_text_from_fd (gint fd, + gsize max_bytes, + gboolean try_locale_if_not_utf8); + +G_END_DECLS + +#endif /* __TRACKER_READ_H__ */ +