Google's site search currently costs $5 per 1,000 searches. It returns search results as JSON so you can easily display them on your website. That seems a bit high, since you could get the same functionality for $3 per 1,000,000 requests using a serverless Amazon Lambda function. Lambdas can be written in Node.js. Another trick is to just use your regular Node.js server to handle site search for basically free.
The first step is to go to the project directory and install the request package for Node.
npm install request
Using request, you can fetch the content of any URL. This means you can fetch result pages from Google too. Using a few simple string helper methods, you can easily parse the returned HTML.
Because the request is asynchronous, the search function cannot return the results directly. Instead, the caller must pass in a callback to handle the results.
Now for the implementation... I won't include the express routes and server. We'll assume you have your own plans for an endpoint. Here's the code:
function mySiteSearch(phrase, site, callback){
if (typeof(site)=="function"){
// callback is supposed to be a function
if (typeof(callback)!="function"){
callback = site;
}
site = "chriswirz.com"; // I search my site by default
}
if (typeof(site)!="string"){
site = "chriswirz.com";
}
var results = [];
// Define a splitWord method
if (!String.prototype.splitWord){
String.prototype.splitWord = function(word){
var arr = [];
try {
var index = 1;
var str = this;
while (str.length > 0 && index >= 0) {
index = str.indexOf(word);
if (index >= 0){
arr.push(str.slice(0, index));
str = str.slice(index + word.length);
}
}
if (str.length > 0){
arr.push(str);
}
}
catch (err){}
return arr;
};
}
// Define a contains method
if (!String.prototype.contains){
String.prototype.contains = function(word){
return this.indexOf(word) > -1;
};
}
// Define a startsWith method
if (!String.prototype.startsWith){
String.prototype.startsWith = function(word){
return this.indexOf(word) == 0;
};
}
// Define a replaceAll method
if (!String.prototype.replaceAll){
String.prototype.replaceAll = function(search, replacement) {
return this.splitWord(search).join(replacement);
};
}
phrase = phrase.replaceAll(" ", "+").trim();
var request = require('request');
request('http://www.google.com/search?q=site%3A' + site + '+' + phrase,
function (error, response, body) {
if (error) {
return {"error": error};
}
// Define a last() method for an array
if (!Array.prototype.last){
Array.prototype.last = function(){
return this[this.length - 1];
};
}
// Define a first() method for an array
if (!Array.prototype.first){
Array.prototype.first = function(){
return this[0];
};
}
var splits = body.splitWord("<a href=\"/url?q=");
if (splits.length < 2) { return {"error" : "No results"}; }
var begin = "<div class=\"";
for (i = 1; i < splits.length; i++) {
if (splits[i].contains("<div class=\"")){
begin = splits[i].splitWord("<div class=\"").last();
break;
}
}
splits = body.splitWord(begin);
if (splits.length < 3) { return {"error" : "Not enough results"}; }
// remove the first
splits = splits.slice(1);
// Fix the trailing HTML on the last
var divs = splits[1]
.splitWord("<div")
.length - 1;
splits[splits.length - 1] = splits[splits.length - 1]
.splitWord("</div>")
.slice(0, divs - 1)
.join("</div>");
for (var i in splits)
{
var r = {};
try {
var str = (splits[i] + "").trim();
if (str.startsWith("<a href=\"/search?q=") || str.length < 15){
continue;
}
r.HTML = begin + str;
var sp = r.HTML.splitWord("<a href=\"/url?q=");
// Check if this is a search result
// if not, carry on
if (sp.length < 2) { continue; }
// Get the URL
r.URL = sp[1].splitWord("&sa=U&")[0];
// Get the title
r.Title = sp[1].splitWord("\">")[1].splitWord("</a>")[0];
// Don't display the image panel
if (r.Title.startsWith("<img style=")) { continue; }
// Get the description
r.Description = sp[1]
.splitWord("<span class=\"st\">")
.last()
.splitWord("</span>")
.first();
// Get the data and updated description
if (r.Description.contains("<b>...</b>")) {
r.Date = Date(r.Description.splitWord("<b>...</b>")[0].trim());
r.Description = r.Description.splitWord("<b>...</b>")[1];
}
if (str.contains("/url?q=http://webcache.googleusercontent.com")
&& str.contains("Cached</a>")) {
// Get the cached URL
r.CachedURL = "http://webcache.googleusercontent.com" +
str.splitWord("/url?q=http://webcache.googleusercontent.com")[1]
.splitWord("\">")[0]
.splitWord("%252B")[0];
}
results.push(r);
}
catch (err){
console.log(err);
console.log(r);
}
}
if (typeof(callback) == "function"){
callback(results);
}
}
);
}
Now I can easily perform site search on chriswirz.com.
// Search an explicitly named site. The results are iterated with forEach
// rather than for...in: for...in also walks enumerable properties inherited
// from Array.prototype, so any helper methods installed there would be
// logged as if they were search results.
mySiteSearch("xml", "chriswirz.com", function(results){
    results.forEach(function(result){
        console.log(result);
    });
});
// Omit the site to search the default one; the callback moves up to the
// second argument.
mySiteSearch("xml", function(results){
    results.forEach(function(result){
        console.log(result);
    });
});