Free Google SiteSearch with NodeJS

Google's Site Search currently costs $5 per 1,000 searches. It lets you get search results back as JSON so you can easily display them on your website. This seems a bit high, since you could get the same functionality for $3 per 1,000,000 requests using an AWS Lambda function, and Lambdas can be written in NodeJS. Another trick is to just use your regular NodeJS server to handle site search for basically free.

Note: Google frequently changes the HTML of its search results page, so this trick has to be updated whenever that happens. It's probably more effective just to pay for site search.

The first step is to go to the project directory and install the request package for Node.


npm install request

Using request, you can just get the content of any URL. This means you can get result pages from Google too. Using a few string helper methods, you can easily parse the returned HTML.

Because the request is asynchronous, the search function cannot return the results directly. Instead, the caller must pass in a callback to handle them.

Now for the implementation... I won't include the express routes and server. We'll assume you have your own plans for an endpoint. Here's the code:


function mySiteSearch(phrase, site, callback){
	if (typeof(site)=="function"){
		// callback is supposed to be a function
		if (typeof(callback)!="function"){
			callback = site;
		}
		site = "chriswirz.com"; // I search my site by default
	}
	if (typeof(site)!="string"){
		site = "chriswirz.com";
	}
	
	var results = [];
	
	// Define a splitWord method
	if (!String.prototype.splitWord){
		String.prototype.splitWord = function(word){
			var arr = [];
			try {
				var index = 1;
				var str = this;
				while (str.length > 0 && index >= 0) {
					index = str.indexOf(word);
					if (index >= 0){
						arr.push(str.slice(0, index));
						str = str.slice(index + word.length);
					}
				}
				if (str.length > 0){
					arr.push(str);
				}
			}
			catch (err){}
			return arr;
		};
	}
	
	// Define a contains method
	if (!String.prototype.contains){
		String.prototype.contains = function(word){
			return this.indexOf(word) > -1;
		};
	}
	
	// Define a startsWith method
	if (!String.prototype.startsWith){
		String.prototype.startsWith = function(word){
			return this.indexOf(word) == 0;
		};
	}
	
	
	// Define a replaceAll method
	if (!String.prototype.replaceAll){
		String.prototype.replaceAll = function(search, replacement) {
			return this.splitWord(search).join(replacement);
		};
	}
	phrase = phrase.replaceAll(" ", "+").trim();

	var request = require('request');
	request('http://www.google.com/search?q=site%3A' + site + '+' + phrase,
		function (error, response, body) {
			if (error) {
			  return {"error": error};
			}
			
			// Define a last() method for an array
			if (!Array.prototype.last){
				Array.prototype.last = function(){
					return this[this.length - 1];
				};
			}
			
			// Define a first() method for an array
			if (!Array.prototype.first){
				Array.prototype.first = function(){
					return this[0];
				};
			}
			
			var splits = body.splitWord("<a href=\"/url?q=");
			if (splits.length < 2) { return {"error" : "No results"}; }

			var begin = "<div class=\"";
			for (i = 1; i < splits.length; i++) {
            	if (splits[i].contains("<div class=\"")){
					begin = splits[i].splitWord("<div class=\"").last();
					break;
				}
			}
			splits = body.splitWord(begin);
			if (splits.length < 3) { return {"error" : "Not enough results"}; }

			// remove the first
			splits = splits.slice(1);
			
			// Fix the trailing HTML on the last
			var divs = splits[1]
				.splitWord("<div")
				.length - 1;
			splits[splits.length - 1] = splits[splits.length - 1]
				.splitWord("</div>")
				.slice(0, divs - 1)
				.join("</div>");

			
			for (var i in splits)
			{
				var r = {}; 
				try {
					var str = (splits[i] + "").trim();
					if (str.startsWith("<a href=\"/search?q=") || str.length < 15){
						continue;
					}
					r.HTML = begin + str;
					var sp = r.HTML.splitWord("<a href=\"/url?q=");
					
					// Check if this is a search result
					// if not, carry on
					if (sp.length < 2) { continue; }

					// Get the URL
					r.URL = sp[1].splitWord("&sa=U&")[0];

					// Get the title
					r.Title = sp[1].splitWord("\">")[1].splitWord("</a>")[0];

					// Don't display the image panel
					if (r.Title.startsWith("<img style=")) { continue; }

					// Get the description
					r.Description = sp[1]
						.splitWord("<span class=\"st\">")
						.last()
						.splitWord("</span>")
						.first();
					
					// Get the data and updated description
					if (r.Description.contains("<b>...</b>")) {
						r.Date = Date(r.Description.splitWord("<b>...</b>")[0].trim());
						r.Description = r.Description.splitWord("<b>...</b>")[1];
					}

					if (str.contains("/url?q=http://webcache.googleusercontent.com") 
						&& str.contains("Cached</a>")) {
						// Get the cached URL
						r.CachedURL = "http://webcache.googleusercontent.com" + 
							str.splitWord("/url?q=http://webcache.googleusercontent.com")[1]
								.splitWord("\">")[0]
								.splitWord("%252B")[0];
					}
					
					results.push(r);
				}
				catch (err){
					console.log(err);
					console.log(r);
				}	
			}
			
			if (typeof(callback) == "function"){
				callback(results);
			}
		}
	);
}

Now I can easily perform site search on chriswirz.com.


// Search an explicitly named site.
mySiteSearch("xml", "chriswirz.com", function(results){
	// forEach visits only the array's elements; for...in would also visit any
	// enumerable methods that have been added to Array.prototype.
	results.forEach(function(result){
		console.log(result);
	});
});
// Omit the site to search the default one.
mySiteSearch("xml", function(results){
	results.forEach(function(result){
		console.log(result);
	});
});