Learning Scala
Spent a couple of hours playing with Scala. It has a very detailed type system and, as a consequence, a very complex syntax. But I was able to put together in a short time something that does a real job – a web crawler. The crawler can fetch as many sites in parallel as you want. An actor takes care of each site. An ExecutorService in the actor takes care of each document. Pretty fast. It probably works for any crawl job that doesn’t have many corner cases. It has a dependency on the Jericho HTML Parser. The complete code is listed below. (Disclaimer: This is just a finger exercise in a language that I have newly learned. Use at your own risk. Please don’t report bugs :–).)
// Compile:
// > scalac -cp e:/jericho-html-3.1/dist/jericho-html-3.1.jar Crawler.scala
// Run:
// > scala -cp e:/jericho-html-3.1/dist/jericho-html-3.1.jar;. Crawler http://somesite1.com http://somesite2.com ...

import java.net._
import java.io._
import java.util.ArrayList
import java.util.concurrent._
import scala.actors.Actor
import net.htmlparser.jericho._

/**
 * Registry of links that have already been seen. It is a singleton, so every
 * CrawlerJob (each running as its own actor) consults the same set.
 */
object CrawlState {
  private var crawledLinks = Set[String]()

  /**
   * Reports whether `link` has been seen before, recording it if it has not.
   *
   * The check and the update happen under one lock so that two jobs asking
   * about the same link at the same time cannot both be told "not crawled".
   *
   * @param link the link text to look up
   * @return true if the link was already recorded, false if it was new
   */
  def isCrawled(link: String): Boolean = synchronized {
    if (crawledLinks.contains(link)) true
    else {
      crawledLinks += link
      false
    }
  }
}

/**
 * Crawls one site. Documents are downloaded on a thread pool; the actor
 * itself saves each finished download to disk and queues the links it finds.
 *
 * @param targetUrl the URL the crawl starts from
 */
class CrawlerJob(targetUrl: String) extends Actor {

  /** Result of one download: the URL that was fetched and the bytes received. */
  class FutureInfo(surl: String, sout: ByteArrayOutputStream) {
    def url(): String = surl
    def out(): ByteArrayOutputStream = sout
  }

  // Pending downloads, used as a LIFO stack (head = most recently submitted).
  private var futures = List[java.util.concurrent.Future[FutureInfo]]()
  private val executor = Executors.newCachedThreadPool()

  /** Downloads a single URL completely into memory. */
  class UrlFetcher(surl: String) extends Callable[FutureInfo] {
    def call(): FutureInfo = {
      println("fetching " + surl)
      val con = new URL(surl).openConnection.asInstanceOf[HttpURLConnection]
      val in = new BufferedInputStream(con.getInputStream)
      try {
        val out = new ByteArrayOutputStream()
        val buffer = new Array[Byte](1024)
        var read = in.read(buffer)
        while (read != -1) {
          // Only the first `read` bytes of the buffer are valid for this pass.
          out.write(buffer, 0, read)
          read = in.read(buffer)
        }
        new FutureInfo(surl, out)
      } finally {
        in.close()
      }
    }
  }

  /**
   * Actor body: submits the start URL, then keeps draining the pending
   * downloads until none are left. Each download is waited on for at most
   * 200 ms; one that is not ready yet is put back and retried.
   */
  def act(): Unit = {
    try {
      crawl(targetUrl)
      while (futures.nonEmpty) {
        val future = futures.head
        futures = futures.tail
        try {
          handle(future.get(200, TimeUnit.MILLISECONDS))
        } catch {
          case _: TimeoutException => futures = future :: futures
          case e: Exception => println(e.getMessage)
        }
      }
    } finally {
      // Let the pool's worker threads terminate so they do not keep the JVM alive.
      executor.shutdown()
    }
  }

  /**
   * Submits `urlToCrawl` for download unless `crawlable` rejects it.
   *
   * @param urlToCrawl absolute URL to fetch; spaces are percent-escaped first
   */
  def crawl(urlToCrawl: String): Unit = {
    if (crawlable(urlToCrawl)) {
      val future = executor.submit(new UrlFetcher(escapeSpaces(urlToCrawl)))
      futures = future :: futures
    }
  }

  /**
   * Saves a finished download to disk and, when the file name contains
   * ".htm", queues the `a`/`img` links found in it.
   *
   * A link is followed when it starts with the target URL, starts with "."
   * or "/", or contains no "://" at all.
   *
   * NOTE(review): the seen-before check uses the link text exactly as it
   * appears in the page, before it is resolved against the page URL, so the
   * same relative link found in two different folders is crawled only once.
   *
   * @param finfo the completed download
   */
  def handle(finfo: FutureInfo): Unit = {
    val surl = finfo.url()
    val out = finfo.out()
    val folderAndFile = extractFolderAndFile(surl)
    val isHtml = folderAndFile.last.contains(".htm")
    saveToFile(createFile(folderAndFile), out)

    val links = new ArrayList[String]
    if (isHtml) {
      val source = new Source(new ByteArrayInputStream(out.toByteArray))
      addLinks(source, HTMLElementName.A, links)
      addLinks(source, HTMLElementName.IMG, links)
    }

    val baseUrl = removeFile(surl)
    for (i <- 0 until links.size) {
      val link = links.get(i)
      // addLinks stores null for elements that lack the attribute.
      if (link != null && !CrawlState.isCrawled(link)) {
        val canCrawl =
          link.startsWith(targetUrl) || link.startsWith(".") || link.startsWith("/") ||
            !link.contains("://")
        if (canCrawl) crawl(linkify(baseUrl, link))
      }
    }
  }

  /** True unless the URL contains one of the fragments this crawler refuses to fetch. */
  private def crawlable(url: String): Boolean = {
    val rejected = List("#", "mailto:", "<", ">", "=", ".cgi", ".php") // && so on ...
    !rejected.exists(fragment => url.contains(fragment))
  }

  /** Replaces every space in the URL with "%20". */
  private def escapeSpaces(url: String): String = url.replace(" ", "%20")

  /**
   * Drops everything from the last "/" of the part after "scheme://", e.g.
   * "http://host/a/b.html" becomes "http://host/a". A URL with no "/" after
   * the host, or with no "://" at all, is returned unchanged.
   */
  private def removeFile(url: String): String = {
    val schemeAt = url.indexOf("://")
    if (schemeAt < 0) url
    else {
      val splitAt = schemeAt + 3
      val prefix = url.substring(0, splitAt)
      val suffix = url.substring(splitAt)
      val lastSlash = suffix.lastIndexOf("/")
      if (lastSlash > 0) prefix + suffix.substring(0, lastSlash) else prefix + suffix
    }
  }

  /**
   * Turns a link into an absolute URL. A link that already contains "://" is
   * returned as is; otherwise one leading "." is dropped, a leading "/" is
   * ensured, and the result is appended to `baseUrl`.
   */
  private def linkify(baseUrl: String, link: String): String = {
    if (link.contains("://")) link
    else {
      val withoutDot = if (link.startsWith(".")) link.substring(1) else link
      val rooted = if (withoutDot.startsWith("/")) withoutDot else "/" + withoutDot
      baseUrl + rooted
    }
  }

  /**
   * Appends to `linksList` the "src" (for img) or "href" (for anything else)
   * attribute value of every `elemName` element in `source`. Elements without
   * that attribute contribute whatever getAttributeValue returns for them.
   */
  private def addLinks(source: Source, elemName: String, linksList: ArrayList[String]): Unit = {
    val key = if (elemName == HTMLElementName.IMG) "src" else "href"
    val elements = source.getAllElements(elemName)
    for (i <- 0 until elements.size) {
      linksList.add(elements.get(i).getAttributeValue(key))
    }
  }

  /**
   * Splits a URL into the local folder and file name it is saved under.
   *
   * The folder is everything between "://" and the last "/", with ":"
   * replaced by "_"; the file is whatever follows the last "/", or
   * "index.html" when that is empty or the URL has no "/" after the host.
   *
   * @return a two-element list: folder, file
   */
  private def extractFolderAndFile(surl: String): List[String] = {
    val defaultFile = "index.html"
    val hostStart = surl.indexOf("://") + 3
    if (surl.indexOf("/", hostStart) < 0) {
      List(surl.substring(hostStart).replace(":", "_"), defaultFile)
    } else {
      // A "/" exists at or after hostStart, so the last "/" is at or after it too.
      val lastSlash = surl.lastIndexOf("/")
      val name = surl.substring(lastSlash + 1)
      val file = if (name.length == 0) defaultFile else name
      List(surl.substring(hostStart, lastSlash).replace(":", "_"), file)
    }
  }

  /** Creates the folder (and any missing parents) and returns the target file inside it. */
  private def createFile(folderAndFile: List[String]): File = {
    val folder = new File(folderAndFile.head)
    if (!folder.exists()) {
      folder.mkdirs()
    }
    new File(folder, folderAndFile.last)
  }

  /** Writes the buffered bytes to `file`, closing the stream even if the write fails. */
  private def saveToFile(file: File, buffer: ByteArrayOutputStream): Unit = {
    val fout = new FileOutputStream(file)
    try {
      fout.write(buffer.toByteArray)
    } finally {
      fout.close()
    }
  }
}

/** Entry point: starts one CrawlerJob actor per URL given on the command line. */
object Crawler {
  def main(args: Array[String]): Unit = {
    args.foreach(targetUrl => new CrawlerJob(targetUrl).start())
  }
}