Extract Website Data Using UrlFetchApp.fetch and Regex – Google Sheets

I'm trying to extract data from a list of >1000 URLs using a script that uses UrlFetchApp.fetch(url) and regex based on this article.

This is the code I'm using.

function importRegex(url, regex_string) {
  var html, content = '';
  var response = UrlFetchApp.fetch(url);
  if (response) {
    html = response.getContentText();
    if (html.length && regex_string.length) {
      var regex = new RegExp( regex_string, "i" );
      content = html.match(regex)[1];
    }
  }
  content = unescapeHTML(content);
  Utilities.sleep(1000); // avoid call limit by adding a delay
  return content;  
}

var htmlEntities = {
  cent:  '¢',
  pound: '£',
  yen:   '¥',
  euro:  '€',
  copy:  '©',
  reg:   '®',
  lt:    '<',
  gt:    '>',
  mdash: '–',
  quot:  '"',
  amp:   '&',
  apos:  '\''
};

function unescapeHTML(str) {
    return str.replace(/\&([^;]+);/g, function (entity, entityCode) {
        var match;

        if (entityCode in htmlEntities) {
            return htmlEntities[entityCode];
        } else if (match = entityCode.match(/^#x([\da-fA-F]+)$/)) {
            return String.fromCharCode(parseInt(match[1], 16));
        } else if (match = entityCode.match(/^#(\d+)$/)) {
            return String.fromCharCode(~~match[1]);
        } else {
            return entity;
        }
    });
};

and the importregex function formula I'm using is

=importRegex(A4, "<h1 class=""ch-title"".*?>(.*)<\/h1>")

It gives the following error

TypeError: Cannot read property '1' of null (line 9).

I'm not sure how to fix it.

Best Answer

First, custom functions can't be used on >1000 of formulas in a single spreadsheet because Google Apps Script limits. You might improve the custom function to make it able to process a range (an array of values) but the execution time should not exceed 30 secs.

Second, regular expressions have several limitations to parse HTML. For details see Using regular expressions to parse HTML: why not?

Regarding the error message, it's very likely that the site that you trying to scrape has measures to prevent that.

Best Answer

Related Solutions

Gmail – How to Automatically Extract Information from Emails Using Regex

Google Sheets – Query Language with Regex Matching

Related Topic