diff --git a/lib/jekyll/migrators/tumblr.rb b/lib/jekyll/migrators/tumblr.rb index 10f41601..0f9c227c 100644 --- a/lib/jekyll/migrators/tumblr.rb +++ b/lib/jekyll/migrators/tumblr.rb @@ -3,93 +3,179 @@ require 'open-uri' require 'fileutils' require 'date' require 'json' +require 'uri' +require 'jekyll' module Jekyll module Tumblr - def self.process(url, grab_images = false, format = "html") - current_page = 0 - - while true - - f = open(url + "/api/read/json/?num=50&start=#{current_page * 50}") - # [21...-2] strips Tumblr's Javascript/JSONP start/end chars - json = f.readlines.join("\n")[21...-2] + def self.process(url, format = "html", grab_images = false, + add_highlights = false, rewrite_urls = true) + FileUtils.mkdir_p "_posts/tumblr" + url += "/api/read/json/" + per_page = 50 + posts = [] + # Two passes are required so that we can rewrite URLs. + # First pass builds up an array of each post as a hash. + begin + current_page = (current_page || -1) + 1 + feed = open(url + "?num=#{per_page}&start=#{current_page * per_page}") + json = feed.readlines.join("\n")[21...-2] # Strip Tumblr's JSONP chars. blog = JSON.parse(json) puts "Page: #{current_page + 1} - Posts: #{blog["posts"].size}" - FileUtils.mkdir_p "_posts/tumblr" - - blog["posts"].each do |post| - - case post['type'] - when "regular" - title = post["regular-title"] - content = post["regular-body"] - when "link" - title = post["link-text"] || post["link-url"] - content = "#{title}" - content << "
" + post["link-description"] unless post["link-description"].nil? - when "photo" - title = post["photo-caption"] - content = "" - content = "#{content}" unless post["photo-link-url"].nil? - when "audio" - if !post["id3-title"].nil? - title = post["id3-title"] - content = post.at["audio-player"] + "
" + post["audio-caption"] - else - title = post["audio-caption"] - content = post.at["audio-player"] - end - when "quote" - title = post["quote-text"] - content = "
#{post["quote-text"]}
" - content << "—" + post["quote-source"] unless post["quote-source"].nil? - when "conversation" - title = post["conversation-title"] - content = "
" - post["conversation"]["line"].each do |line| - content << "
#{line['label']}
#{line}
" - end - content << "
" - when "video" - title = post["video-title"] - content = post["video-player"] - content << "
" + post["video-caption"] unless post["video-caption"].nil? - end # End post types - - name = "#{Date.parse(post['date']).to_s}-#{title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')}.#{format}" - - File.open("_posts/tumblr/#{name}", "w") do |f| - if format == "md" - preserve = ["table", "tr", "th", "td"] - preserve.each { |tag| content = content.gsub(/<#{tag}/i, "$$" + tag).gsub(/<\/#{tag}/i, "||" + tag) } - content = %x[echo '#{content.gsub("'", "''")}' | html2text] - preserve.each { |tag| content = content.gsub("$$" + tag, "<" + tag).gsub("||" + tag, " "post", "title" => title, "tags" => post["tags"]} - f.puts header.to_yaml + "---\n" + content - end # End file - - end # End post XML - - if blog["posts"].size < 50 - break + posts += blog["posts"].map { |post| post_to_hash(post, format) } + end until blog["posts"].size < per_page + # Rewrite URLs and create redirects. + posts = rewrite_urls_and_redirects posts if rewrite_urls + # Second pass for writing post files. + posts.each do |post| + if format == "md" + post[:content] = html_to_markdown post[:content] + post[:content] = add_syntax_highlights post[:content] if add_highlights end - current_page += 1 - - end # End while loop - end # End method + File.open("_posts/tumblr/#{post[:name]}", "w") do |f| + f.puts post[:header].to_yaml + "---\n" + post[:content] + end + end + end private + # Converts each type of Tumblr post to a hash with all required + # data for Jekyll. + def self.post_to_hash(post, format) + case post['type'] + when "regular" + title = post["regular-title"] + content = post["regular-body"] + when "link" + title = post["link-text"] || post["link-url"] + content = "#{title}" + unless post["link-description"].nil? + content << "
" + post["link-description"] + end + when "photo" + title = post["photo-caption"] + content = "" + unless post["photo-link-url"].nil? + content = "#{content}" + end + when "audio" + if !post["id3-title"].nil? + title = post["id3-title"] + content = post.at["audio-player"] + "
" + post["audio-caption"] + else + title = post["audio-caption"] + content = post.at["audio-player"] + end + when "quote" + title = post["quote-text"] + content = "
#{post["quote-text"]}
" + unless post["quote-source"].nil? + content << "—" + post["quote-source"] + end + when "conversation" + title = post["conversation-title"] + content = "
" + post["conversation"]["line"].each do |line| + content << "
#{line['label']}
#{line}
" + end + content << "
" + when "video" + title = post["video-title"] + content = post["video-player"] + unless post["video-caption"].nil? + content << "
" + post["video-caption"] + end + end + date = Date.parse(post['date']).to_s + slug = title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '') + { + :name => "#{date}-#{slug}.#{format}", + :header => { + "layout" => "post", + "title" => title, + "tags" => post["tags"], + }, + :content => content, + :url => post["url"], + :slug => post["url-with-slug"], + } + end + + # Create a Hash of old urls => new urls, for rewriting and + # redirects, and replace urls in each post. Instantiate Jekyll + # site/posts to get the correct permalink format. + def self.rewrite_urls_and_redirects(posts) + site = Jekyll::Site.new(Jekyll.configuration({})) + dir = File.join(File.dirname(__FILE__), "..") + urls = Hash[posts.map { |post| + tumblr_url = URI.parse(post[:slug]).path + jekyll_url = Jekyll::Post.new(site, dir, "", "tumblr/" + post[:name]).url + redirect_dir = tumblr_url.sub(/\//, "") + "/" + FileUtils.mkdir_p redirect_dir + File.open(redirect_dir + "index.html", "w") do |f| + f.puts "" + end + [tumblr_url, jekyll_url] + }] + posts.map { |post| + urls.each do |tumblr_url, jekyll_url| + post[:content].gsub!(/#{tumblr_url}/i, jekyll_url) + end + post + } + end + + # Uses Python's html2text to convert a post's content to + # markdown. Preserve HTML tables as per the markdown docs. + def self.html_to_markdown(content) + preserve = ["table", "tr", "th", "td"] + preserve.each do |tag| + content.gsub!(/<#{tag}/i, "$$" + tag) + content.gsub!(/<\/#{tag}/i, "||" + tag) + end + content = %x[echo '#{content.gsub("'", "''")}' | html2text] + preserve.each do |tag| + content.gsub!("$$" + tag, "<" + tag) + content.gsub!("||" + tag, "