Learning Scala

I spent a couple of hours playing with Scala. It has a very detailed type system and (as a consequence) a very complex syntax. But in a short time I was able to put together something that does a real job – a web crawler. The crawler can fetch as many sites in parallel as you want. An actor takes care of each site, and an ExecutorService in the actor takes care of each document. It is pretty fast and probably works for any crawl job that doesn’t have many corner cases. It has a dependency on the Jericho HTML Parser. The complete code is listed below. (Disclaimer: This is just a finger exercise in a language that I have newly learned. Use at your own risk. Please don’t report bugs :–).)

// Compile: 
// > scalac -cp e:/jericho-html-3.1/dist/jericho-html-3.1.jar Crawler.scala
// Run:
// > scala -cp e:/jericho-html-3.1/dist/jericho-html-3.1.jar;. Crawler http://somesite1.com http://somesite2.com ...

import java.net._
import java.io._
import java.util.ArrayList
import java.util.concurrent._
import scala.actors.Actor

import net.htmlparser.jericho._

/**
 * Process-wide registry of the links that have already been claimed for crawling.
 *
 * The registry is a single shared object that may be queried from several threads
 * at once, so every access goes through this object's monitor.
 */
object CrawlState {
  // An immutable Set gives fast membership checks; the reference is only read or
  // replaced while holding the lock taken in isCrawled.
  private var crawledLinks = Set[String]()

  /**
   * Tests whether `link` has been seen before and records it if it has not.
   *
   * Despite the query-style name this method has a side effect: the first call for
   * a given link marks it as crawled. The check and the update happen atomically,
   * so exactly one caller gets `false` for any given link.
   *
   * @param link the link to test and record
   * @return `false` the first time a link is passed in, `true` on every later call
   */
  def isCrawled(link: String): Boolean = synchronized {
    val alreadySeen = crawledLinks.contains(link)
    if (!alreadySeen) {
      crawledLinks = crawledLinks + link
    }
    alreadySeen
  }
}

/**
 * Crawls one site, starting at `targetUrl`, and saves every fetched document to disk.
 *
 * Documents are downloaded on a thread pool owned by this job. The actor's own thread
 * collects finished downloads, writes them to a folder derived from the URL, and
 * schedules the links found in HTML pages.
 *
 * @param targetUrl the URL the crawl starts from; absolute links are only followed
 *                  when they start with this prefix
 */
class CrawlerJob(targetUrl: String) extends Actor {

  /** Result of one download: the URL that was fetched and the bytes that came back. */
  class FutureInfo(surl: String, sout: ByteArrayOutputStream) {
    def url(): String = surl
    def out(): ByteArrayOutputStream = sout
  }

  // File name used when a URL does not name a document of its own.
  private val defaultFileName = "index.html"

  // A URL containing any of these substrings is never fetched.
  private val blockedMarkers = List("#", "mailto:", "<", ">", "=", ".cgi", ".php")

  // Pending downloads, most recently submitted first. Only touched from the thread
  // that runs act().
  private var futures = List[java.util.concurrent.Future[FutureInfo]]()
  private val executor = Executors.newCachedThreadPool()

  /** Downloads the document at `surl` into memory. */
  class UrlFetcher(surl: String) extends Callable[FutureInfo] {
    /**
     * @return the fetched URL together with the complete response body
     * @throws IOException if the URL is not an HTTP(S) URL or the download fails
     */
    def call(): FutureInfo = {
      println("fetching " + surl)
      new URL(surl).openConnection() match {
        case con: HttpURLConnection =>
          val in = new BufferedInputStream(con.getInputStream)
          try {
            val out = new ByteArrayOutputStream()
            val buffer = new Array[Byte](1024)
            var read = in.read(buffer)
            while (read != -1) {
              // Copy only the bytes this read delivered; the rest of the buffer
              // still holds data from earlier reads.
              out.write(buffer, 0, read)
              read = in.read(buffer)
            }
            new FutureInfo(surl, out)
          } finally {
            in.close()
          }
        case other =>
          throw new IOException("not an HTTP(S) URL: " + surl)
      }
    }
  }

  /**
   * Runs the crawl: fetches `targetUrl`, then keeps processing finished downloads
   * (which may schedule further ones) until none are pending. The thread pool is
   * shut down when the crawl ends, whether normally or through an exception.
   */
  def act(): Unit = {
    try {
      crawl(targetUrl)
      while (futures.nonEmpty) {
        val future = futures.head
        futures = futures.tail
        try {
          handle(future.get(200, TimeUnit.MILLISECONDS))
        } catch {
          // Not finished yet: put it back and poll again.
          case timedOut: TimeoutException => futures = future :: futures
          // A failed download or save is reported and dropped; the crawl continues.
          case e: Exception => println(e.getMessage)
        }
      }
    } finally {
      executor.shutdown()
    }
  }

  /**
   * Schedules `urlToCrawl` for download unless `crawlable` rejects it.
   *
   * Mutates the pending-download list without locking, so it is meant to be called
   * from the thread running `act()`.
   *
   * @param urlToCrawl absolute URL of the document to fetch
   */
  def crawl(urlToCrawl: String): Unit = {
    if (crawlable(urlToCrawl)) {
      val future = executor.submit(new UrlFetcher(escapeSpaces(urlToCrawl)))
      futures = future :: futures
    }
  }

  /**
   * Saves a downloaded document and, when its file name contains ".htm", schedules
   * the links of its `a` and `img` elements.
   *
   * A link is followed at most once. It is resolved against the page URL first, so
   * the same relative link on two different pages counts as two different documents.
   *
   * @param finfo the finished download
   */
  def handle(finfo: FutureInfo): Unit = {
    val pageUrl = finfo.url()
    val body = finfo.out()
    val (folder, fileName) = extractFolderAndFile(pageUrl)
    saveToFile(createFile(folder, fileName), body)

    if (fileName.contains(".htm")) {
      val page = new URL(pageUrl)
      val source = new Source(new ByteArrayInputStream(body.toByteArray))
      val links = extractLinks(source, HTMLElementName.A) :::
        extractLinks(source, HTMLElementName.IMG)
      for {
        link <- links
        if isOnSiteCandidate(link)
        resolved <- resolveLink(page, link)
      } {
        if (!CrawlState.isCrawled(resolved)) {
          crawl(resolved)
        }
      }
    }
  }

  /** True unless `url` contains one of the blocked substrings. */
  private def crawlable(url: String): Boolean =
    !blockedMarkers.exists(marker => url.contains(marker))

  /** Percent-encodes spaces so the URL can be opened. */
  private def escapeSpaces(url: String): String =
    url.replace(" ", "%20")

  /**
   * Pre-filter on the raw link text: accepts absolute links under `targetUrl` and
   * anything that looks relative (starts with "." or "/", or has no "://").
   */
  private def isOnSiteCandidate(link: String): Boolean =
    link.startsWith(targetUrl) || link.startsWith(".") || link.startsWith("/") ||
      !link.contains("://")

  /**
   * Resolves `link` against the page it was found on.
   *
   * @param page URL of the page containing the link
   * @param link the raw attribute value
   * @return the absolute URL, or `None` when the link cannot be parsed as a URL or
   *         points to a different host than `page` (e.g. a "//other.host/x" link)
   */
  private def resolveLink(page: URL, link: String): Option[String] = {
    try {
      val resolved = new URL(page, link)
      if (page.getHost.equalsIgnoreCase(resolved.getHost)) Some(resolved.toString)
      else None
    } catch {
      case e: MalformedURLException => None
    }
  }

  /**
   * Collects the link attribute of every `elemName` element in `source`: `src` for
   * `img` elements, `href` otherwise. Elements without the attribute and blank
   * values are skipped; values are trimmed.
   */
  private def extractLinks(source: Source, elemName: String): List[String] = {
    val key = if (elemName == HTMLElementName.IMG) "src" else "href"
    val elements = source.getAllElements(elemName)
    val values = (0 until elements.size()).toList.map { i =>
      elements.get(i).getAttributeValue(key)
    }
    values.filter(_ != null).map(_.trim).filter(_.length > 0)
  }

  /**
   * Splits a URL into the local folder and file name it is saved under.
   *
   * The folder is everything between the scheme separator and the last "/" (with
   * ":" replaced by "_"); the file is whatever follows the last "/", or
   * "index.html" when that is empty or the URL has no path at all.
   *
   * @param surl the URL of the fetched document
   * @return `(folder, fileName)`
   */
  private def extractFolderAndFile(surl: String): (String, String) = {
    val schemeIdx = surl.indexOf("://")
    // Test for "not found" before adding the separator length.
    val start = if (schemeIdx < 0) 0 else schemeIdx + 3
    val rest = surl.substring(start)
    val lastSlash = rest.lastIndexOf("/")
    if (lastSlash < 0) {
      (rest.replace(":", "_"), defaultFileName)
    } else {
      val name = rest.substring(lastSlash + 1)
      val fileName = if (name.length == 0) defaultFileName else name
      (rest.substring(0, lastSlash).replace(":", "_"), fileName)
    }
  }

  /** Creates `folder` (and any missing parents) if needed and returns the target file in it. */
  private def createFile(folder: String, fileName: String): File = {
    val dir = new File(folder)
    if (!dir.exists()) {
      dir.mkdirs()
    }
    new File(dir, fileName)
  }

  /** Writes the buffered document to `file`, closing the file even if the write fails. */
  private def saveToFile(file: File, buffer: ByteArrayOutputStream): Unit = {
    val fout = new FileOutputStream(file)
    try {
      buffer.writeTo(fout)
    } finally {
      fout.close()
    }
  }
}

/** Command-line entry point: starts one CrawlerJob actor per URL given on the command line. */
object Crawler {
  /**
   * Starts a crawl job for every argument, or prints a usage line when there are none.
   *
   * @param args the start URLs, one crawl job each
   */
  def main(args: Array[String]): Unit = {
    if (args.isEmpty) {
      System.err.println("Usage: scala Crawler <url> [<url> ...]")
    } else {
      args.foreach(targetUrl => new CrawlerJob(targetUrl).start())
    }
  }
}