I listen to a podcast called Crap from the Past. It’s a radio show that airs weekly in Minneapolis and specializes in playing music from my childhood. The show has been on the air for ~26 years. I sometimes want to know if they’ve ever played a particular song, but searching involves going through one of 26 static web pages. (Or, you know, asking Google.) I decided to write my first real Swift program to scrape their pages and put the shows into a CSV file I could look at in Numbers.
//
//  main.swift
//  CrapScraper
//

import Foundation

/// Writes a diagnostic line to standard error so it never ends up in the CSV
/// that this tool emits on standard output.
private func logError(_ message: String) {
    FileHandle.standardError.write(Data((message + "\n").utf8))
}

/// Downloads the resource at `address` and returns its body decoded as UTF-8.
///
/// The call blocks the current thread until the request has completed.
///
/// - Parameter address: The URL to fetch.
/// - Returns: The decoded body, or an empty string when the request fails,
///   returns no data, or the data is not valid UTF-8.
func getPage(address: URL) -> String {
    let session = URLSession(configuration: .ephemeral, delegate: nil, delegateQueue: nil)
    // A session that is never invalidated is kept alive until the process exits.
    defer { session.finishTasksAndInvalidate() }

    // The semaphore replaces a spin loop on a shared flag: it parks the thread
    // instead of burning CPU, and it orders the write to `result` in the
    // completion handler before the read below.
    let finished = DispatchSemaphore(value: 0)
    var result = ""

    let task = session.dataTask(with: address) { (data, response, error) in
        defer { finished.signal() }
        if let error = error {
            logError("error: \(error)")
        } else if let data = data, let string = String(data: data, encoding: .utf8) {
            result = string
        } else {
            logError("error: could not decode \(address) as UTF-8")
        }
    }
    task.resume()
    finished.wait()

    return result
}

/// Makes `str` safe to drop into a comma-separated row by replacing every
/// comma with a semicolon.
///
/// - Parameter str: The field text; modified in place.
func escapeString(str: inout String) {
    // Because Numbers and Excel don't actually handle all of RFC4180, we can't just
    // put quotes around the string and escape the inner quotes to handle commas. We
    // need to actually replace them. This changes the data which is not great, but
    // we don't have a lot of choice here.
    str = str.replacingOccurrences(of: ",", with: ";")
}

/// Scans forward to the next archive.org show link and returns the date part
/// of that link.
///
/// - Parameter scanner: Scanner positioned inside a show heading. On success
///   it is left at the double quote that closes the link.
/// - Returns: The text between the `cftp-` prefix and the closing quote,
///   escaped for CSV, or an empty string when no link is found.
func extractDate(scanner: Scanner) -> String {
    // Links appear with both http and https schemes. Searching for the part
    // they share finds the nearest link of either kind; searching for http
    // first could jump past an https entry to a later http one.
    let linkMarker = "://www.archive.org/details/cftp-"

    if !scanner.scanUpTo(linkMarker, into: nil) {
        return ""
    }
    if !scanner.scanString(linkMarker, into: nil) {
        return ""
    }

    var nsDateStr: NSString?
    guard scanner.scanUpTo("\"", into: &nsDateStr), let scanned = nsDateStr else {
        return ""
    }

    var dateStr = scanned as String
    escapeString(str: &dateStr)
    return dateStr
}

/// Reads the show title that follows the link inside a show heading.
///
/// - Parameter scanner: Scanner positioned before the heading's `</a>`. On
///   success it is left at the heading's `</h2>`.
/// - Returns: The title with a leading ": " removed and escaped for CSV, or
///   an empty string when the heading has no title.
func extractShowName(scanner: Scanner) -> String {
    // The show name may be blank in some cases.
    // If there is a title, it usually starts with ": " which
    // we want to remove.
    let anchorEnd = "</a>"
    if !scanner.scanUpTo(anchorEnd, into: nil) {
        return ""
    }
    if !scanner.scanString(anchorEnd, into: nil) {
        return ""
    }

    var nsShowName: NSString?
    guard scanner.scanUpTo("</h2>", into: &nsShowName), let scanned = nsShowName else {
        return ""
    }

    var showName = scanned as String
    if showName.starts(with: ": ") {
        showName.removeFirst(2)
    }
    escapeString(str: &showName)
    return showName
}

/// Reads the next `<p class="indent">` paragraph and splits it into songs.
///
/// - Parameter scanner: Scanner positioned before the paragraph. On success
///   it is left just past the paragraph's `</p>`.
/// - Returns: One CSV-escaped entry per `<br>`-separated piece of the
///   paragraph; empty when no paragraph is found.
func extractSongs(scanner: Scanner) -> [String] {
    var songs: [String] = []

    let indent = "<p class=\"indent\">"
    if !scanner.scanUpTo(indent, into: nil) {
        return songs
    }
    if !scanner.scanString(indent, into: nil) {
        return songs
    }

    var nsSongInfo: NSString?
    let paragraphEnd = "</p>"
    guard scanner.scanUpTo(paragraphEnd, into: &nsSongInfo), let songInfo = nsSongInfo else {
        return songs
    }
    if !scanner.scanString(paragraphEnd, into: nil) {
        return songs
    }

    // A second scanner walks the paragraph text on its own so the page
    // scanner's position is not disturbed.
    let songScanner = Scanner(string: songInfo as String)
    let breakTag = "<br>"
    while !songScanner.isAtEnd {
        var nsNextSong: NSString?
        guard songScanner.scanUpTo(breakTag, into: &nsNextSong), let scannedSong = nsNextSong else {
            return songs
        }

        var nextSong = scannedSong as String
        escapeString(str: &nextSong)
        songs.append(nextSong)

        if !songScanner.scanString(breakTag, into: nil) {
            return songs
        }
    }

    return songs
}

/// Converts one playlist page into CSV rows, one row per `<h2>` section.
///
/// Each row is `date, show name, song, song, ...` followed by a newline.
///
/// - Parameter page: The HTML text of a playlist page.
/// - Returns: The rows for every section found, or an empty string when the
///   page has no `<body>` tag or no sections.
func analyzePage(page: String) -> String {
    let scanner = Scanner(string: page)

    // Sanity check - make sure we've at least got an HTML body tag
    let body = "<body>"
    if !scanner.scanUpTo(body, into: nil) {
        return ""
    }
    if !scanner.scanString(body, into: nil) {
        return ""
    }

    // Start scanning for the <h2> sections
    let h2 = "<h2>"
    var csv = ""
    while true {
        // scanUpTo() reports failure when nothing but skipped whitespace
        // precedes the tag, so its result cannot tell "tag is right here"
        // from "no more tags". Whether scanString() can consume the tag is
        // what decides if there is another section.
        _ = scanner.scanUpTo(h2, into: nil)
        guard scanner.scanString(h2, into: nil) else {
            break
        }

        let dateStr = extractDate(scanner: scanner)
        let showName = extractShowName(scanner: scanner)
        let songList = extractSongs(scanner: scanner).joined(separator: ", ")

        csv.append([dateStr, showName, songList].joined(separator: ", "))
        csv.append("\n")
    }

    return csv
}

// Emit the header row, then the rows for each year's playlist page.
print("\"date\", \"Show Name\", \"Song Titles\"\n")

for year in 1992...2018 {
    guard let nextAddress = URL(string: "http://crapfromthepast.com/playlists/\(year).htm") else {
        continue
    }

    let nextPage = getPage(address: nextAddress)
    if nextPage.isEmpty {
        continue
    }

    let analyzedPage = analyzePage(page: nextPage)
    if !analyzedPage.isEmpty {
        print("\(analyzedPage)")
    }
}
Please note that running this will actually grab the pages from the website, so please be respectful and don’t hammer them!