diff --git a/lib/jekyll/migrators/tumblr.rb b/lib/jekyll/migrators/tumblr.rb index d7cb3969..367a83c9 100644 --- a/lib/jekyll/migrators/tumblr.rb +++ b/lib/jekyll/migrators/tumblr.rb @@ -1,119 +1,195 @@ require 'rubygems' -require 'nokogiri' require 'open-uri' require 'fileutils' -require 'CGI' -require 'iconv' +require 'nokogiri' require 'date' +require 'json' +require 'uri' +require 'jekyll' module Jekyll module Tumblr - def self.process(url, grab_images = false) - current_page = 0 - - while true - f = open(url + "/api/read?num=50&start=#{current_page * 50}") - doc = Nokogiri::HTML(Iconv.conv("utf-8", f.charset, f.readlines.join("\n"))) - - puts "Page: #{current_page + 1} - Posts: #{(doc/:tumblr/:posts/:post).size}" - - FileUtils.mkdir_p "_posts/tumblr" - - (doc/:tumblr/:posts/:post).each do |post| - title = "" - content = nil - name = nil - - if post['type'] == "regular" - title_element = post.at("regular-title") - title = title_element.inner_text unless title_element == nil - content = CGI::unescapeHTML post.at("regular-body").inner_html unless post.at("regular-body") == nil - elsif post['type'] == "link" - title = post.at("link-text").inner_html unless post.at("link-text") == nil - - if post.at("link-text") != nil - content = "#{post.at("link-text").inner_html}" - else - content = "#{post.at("link-url").inner_html}" - end - - content << "
" + CGI::unescapeHTML(post.at("link-description").inner_html) unless post.at("link-description") == nil - elsif post['type'] == "photo" - content = "" - - if post.at("photo-link-url") != nil - content = "" - else - content = "" - end - - if post.at("photo-caption") != nil - content << "
" unless content == nil - content << CGI::unescapeHTML(post.at("photo-caption").inner_html) - end - elsif post['type'] == "audio" - content = CGI::unescapeHTML(post.at("audio-player").inner_html) - content << CGI::unescapeHTML(post.at("audio-caption").inner_html) unless post.at("audio-caption") == nil - elsif post['type'] == "quote" - content = "
" + CGI::unescapeHTML(post.at("quote-text").inner_html) + "
" - content << "—" + CGI::unescapeHTML(post.at("quote-source").inner_html) unless post.at("quote-source") == nil - elsif post['type'] == "conversation" - title = post.at("conversation-title").inner_html unless post.at("conversation-title") == nil - content = "
" - - (post/:conversation/:line).each do |line| - content << "
" + line['label'] + "
" + line.inner_html + "
" unless line['label'] == nil || line == nil - end - - content << "
" - elsif post['type'] == "video" - title = post.at("video-title").inner_html unless post.at("video-title") == nil - content = CGI::unescapeHTML(post.at("video-player").inner_html) - content << CGI::unescapeHTML(post.at("video-caption").inner_html) unless post.at("video-caption") == nil - end # End post types - - name = "#{Date.parse(post['date']).to_s}-#{post['id'].downcase.gsub(/[^a-z0-9]/, '-')}.html" - - if title != nil || content != nil && name != nil - File.open("_posts/tumblr/#{name}", "w") do |f| - - f.puts <<-HEADER ---- -layout: post -title: #{title} ---- - -HEADER - - f.puts content - end # End file - end - - end # End post XML - - if (doc/:tumblr/:posts/:post).size < 50 - break - else - current_page = current_page + 1 + def self.process(url, format = "html", grab_images = false, + add_highlights = false, rewrite_urls = true) + @grab_images = grab_images + FileUtils.mkdir_p "_posts/tumblr" + url += "/api/read/json/" + per_page = 50 + posts = [] + # Two passes are required so that we can rewrite URLs. + # First pass builds up an array of each post as a hash. + begin + current_page = (current_page || -1) + 1 + feed = open(url + "?num=#{per_page}&start=#{current_page * per_page}") + json = feed.readlines.join("\n")[21...-2] # Strip Tumblr's JSONP chars. + blog = JSON.parse(json) + puts "Page: #{current_page + 1} - Posts: #{blog["posts"].size}" + posts += blog["posts"].map { |post| post_to_hash(post, format) } + end until blog["posts"].size < per_page + # Rewrite URLs and create redirects. + posts = rewrite_urls_and_redirects posts if rewrite_urls + # Second pass for writing post files. + posts.each do |post| + if format == "md" + post[:content] = html_to_markdown post[:content] + post[:content] = add_syntax_highlights post[:content] if add_highlights end - - end # End while loop - end # End method + File.open("_posts/tumblr/#{post[:name]}", "w") do |f| + f.puts post[:header].to_yaml + "---\n" + post[:content] + end + end + end private - def self.save_file(url, grab_image = false) - unless grab_image == false - FileUtils.mkdir_p "tumblr_files" - - File.open("tumblr_files/#{url.split('/').last}", "w") do |f| - f.write(open(url).read) - end - - return "/tumblr_files/#{url.split('/').last}" - else - return url + # Converts each type of Tumblr post to a hash with all required + # data for Jekyll. + def self.post_to_hash(post, format) + case post['type'] + when "regular" + title = post["regular-title"] + content = post["regular-body"] + when "link" + title = post["link-text"] || post["link-url"] + content = "#{title}" + unless post["link-description"].nil? + content << "
" + post["link-description"] + end + when "photo" + title = post["photo-caption"] + max_size = post.keys.map{ |k| k.gsub("photo-url-", "").to_i }.max + url = post["photo-url"] || post["photo-url-#{max_size}"] + ext = "." + post[post.keys.select { |k| + k =~ /^photo-url-/ && post[k].split("/").last =~ /\./ + }.first].split(".").last + content = "" + unless post["photo-link-url"].nil? + content = "#{content}" + end + when "audio" + if !post["id3-title"].nil? + title = post["id3-title"] + content = post.at["audio-player"] + "
" + post["audio-caption"] + else + title = post["audio-caption"] + content = post.at["audio-player"] + end + when "quote" + title = post["quote-text"] + content = "
#{post["quote-text"]}
" + unless post["quote-source"].nil? + content << "—" + post["quote-source"] + end + when "conversation" + title = post["conversation-title"] + content = "
" + post["conversation"]["line"].each do |line| + content << "
#{line['label']}
#{line}
" + end + content << "
" + when "video" + title = post["video-title"] + content = post["video-player"] + unless post["video-caption"].nil? + content << "
" + post["video-caption"] + end end + date = Date.parse(post['date']).to_s + title = Nokogiri::HTML(title).text + slug = title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '') + { + :name => "#{date}-#{slug}.#{format}", + :header => { + "layout" => "post", + "title" => title, + "tags" => post["tags"], + }, + :content => content, + :url => post["url"], + :slug => post["url-with-slug"], + } + end + + # Create a Hash of old urls => new urls, for rewriting and + # redirects, and replace urls in each post. Instantiate Jekyll + # site/posts to get the correct permalink format. + def self.rewrite_urls_and_redirects(posts) + site = Jekyll::Site.new(Jekyll.configuration({})) + dir = File.join(File.dirname(__FILE__), "..") + urls = Hash[posts.map { |post| + # Create an initial empty file for the post so that + # we can instantiate a post object. + File.open("_posts/tumblr/#{post[:name]}", "w") + tumblr_url = URI.parse(post[:slug]).path + jekyll_url = Jekyll::Post.new(site, dir, "", "tumblr/" + post[:name]).url + redirect_dir = tumblr_url.sub(/\//, "") + "/" + FileUtils.mkdir_p redirect_dir + File.open(redirect_dir + "index.html", "w") do |f| + f.puts "" + end + [tumblr_url, jekyll_url] + }] + posts.map { |post| + urls.each do |tumblr_url, jekyll_url| + post[:content].gsub!(/#{tumblr_url}/i, jekyll_url) + end + post + } + end + + # Uses Python's html2text to convert a post's content to + # markdown. Preserve HTML tables as per the markdown docs. + def self.html_to_markdown(content) + preserve = ["table", "tr", "th", "td"] + preserve.each do |tag| + content.gsub!(/<#{tag}/i, "$$" + tag) + content.gsub!(/<\/#{tag}/i, "||" + tag) + end + content = %x[echo '#{content.gsub("'", "''")}' | html2text] + preserve.each do |tag| + content.gsub!("$$" + tag, "<" + tag) + content.gsub!("||" + tag, "