diff --git a/lib/jekyll/migrators/tumblr.rb b/lib/jekyll/migrators/tumblr.rb
index 10f41601..0f9c227c 100644
--- a/lib/jekyll/migrators/tumblr.rb
+++ b/lib/jekyll/migrators/tumblr.rb
@@ -3,93 +3,179 @@ require 'open-uri'
require 'fileutils'
require 'date'
require 'json'
+require 'uri'
+require 'jekyll'
module Jekyll
module Tumblr
- def self.process(url, grab_images = false, format = "html")
- current_page = 0
-
- while true
-
- f = open(url + "/api/read/json/?num=50&start=#{current_page * 50}")
- # [21...-2] strips Tumblr's Javascript/JSONP start/end chars
- json = f.readlines.join("\n")[21...-2]
+ # Imports a Tumblr blog into "_posts/tumblr", one file per post.
+ #
+ # url            - blog base URL; "/api/read/json/" is appended to it.
+ # format         - extension used for the post file names; "md" also
+ #                  runs each post through html_to_markdown.
+ # grab_images    - accepted, but not referenced in this method body.
+ # add_highlights - when true and format is "md", runs
+ #                  add_syntax_highlights on the converted content.
+ # rewrite_urls   - when true, passes the collected posts through
+ #                  rewrite_urls_and_redirects before writing them.
+ def self.process(url, format = "html", grab_images = false,
+ add_highlights = false, rewrite_urls = true)
+ FileUtils.mkdir_p "_posts/tumblr"
+ url += "/api/read/json/"
+ per_page = 50
+ posts = []
+ # Two passes are required so that we can rewrite URLs.
+ # First pass builds up an array of each post as a hash.
+ begin
+ # current_page is nil on the first iteration, so paging starts at 0.
+ current_page = (current_page || -1) + 1
+ feed = open(url + "?num=#{per_page}&start=#{current_page * per_page}")
+ json = feed.readlines.join("\n")[21...-2] # Strip Tumblr's JSONP chars.
blog = JSON.parse(json)
puts "Page: #{current_page + 1} - Posts: #{blog["posts"].size}"
- FileUtils.mkdir_p "_posts/tumblr"
-
- blog["posts"].each do |post|
-
- case post['type']
- when "regular"
- title = post["regular-title"]
- content = post["regular-body"]
- when "link"
- title = post["link-text"] || post["link-url"]
- content = "#{title}"
- content << "
" + post["link-description"] unless post["link-description"].nil?
- when "photo"
- title = post["photo-caption"]
- content = "
"
- content = "#{content}" unless post["photo-link-url"].nil?
- when "audio"
- if !post["id3-title"].nil?
- title = post["id3-title"]
- content = post.at["audio-player"] + "
" + post["audio-caption"]
- else
- title = post["audio-caption"]
- content = post.at["audio-player"]
- end
- when "quote"
- title = post["quote-text"]
- content = "
#{post["quote-text"]}
"
- content << "—" + post["quote-source"] unless post["quote-source"].nil?
- when "conversation"
- title = post["conversation-title"]
- content = ""
- when "video"
- title = post["video-title"]
- content = post["video-player"]
- content << "
" + post["video-caption"] unless post["video-caption"].nil?
- end # End post types
-
- name = "#{Date.parse(post['date']).to_s}-#{title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')}.#{format}"
-
- File.open("_posts/tumblr/#{name}", "w") do |f|
- if format == "md"
- preserve = ["table", "tr", "th", "td"]
- preserve.each { |tag| content = content.gsub(/<#{tag}/i, "$$" + tag).gsub(/<\/#{tag}/i, "||" + tag) }
- content = %x[echo '#{content.gsub("'", "''")}' | html2text]
- preserve.each { |tag| content = content.gsub("$$" + tag, "<" + tag).gsub("||" + tag, "" + tag) }
- end
- header = {"layout" => "post", "title" => title, "tags" => post["tags"]}
- f.puts header.to_yaml + "---\n" + content
- end # End file
-
- end # End post XML
-
- if blog["posts"].size < 50
- break
+ posts += blog["posts"].map { |post| post_to_hash(post, format) }
+ # A page holding fewer than per_page posts ends the loop.
+ end until blog["posts"].size < per_page
+ # Rewrite URLs and create redirects.
+ posts = rewrite_urls_and_redirects posts if rewrite_urls
+ # Second pass for writing post files.
+ posts.each do |post|
+ if format == "md"
+ post[:content] = html_to_markdown post[:content]
+ post[:content] = add_syntax_highlights post[:content] if add_highlights
end
- current_page += 1
-
- end # End while loop
- end # End method
+ File.open("_posts/tumblr/#{post[:name]}", "w") do |f|
+ f.puts post[:header].to_yaml + "---\n" + post[:content]
+ end
+ end
+ end
private
+ # Converts one Tumblr post (a Hash parsed from the JSON feed) into a
+ # hash holding all the data required for Jekyll.
+ #
+ # post   - Hash for a single post; the keys read depend on post['type'].
+ # format - file extension appended to the generated file name.
+ #
+ # Returns a Hash with :name (file name), :header (front-matter Hash),
+ # :content (String), :url and :slug (the post's "url" and
+ # "url-with-slug" values).
+ #
+ # NOTE(review): several string literals below are just "\n" or a bare
+ # interpolation; they look like HTML markup was lost along the way.
+ # Confirm against the upstream file before relying on them.
+ def self.post_to_hash(post, format)
+   case post['type']
+   when "regular"
+     title = post["regular-title"]
+     content = post["regular-body"]
+   when "link"
+     title = post["link-text"] || post["link-url"]
+     content = "#{title}"
+     unless post["link-description"].nil?
+       content << "\n" + post["link-description"]
+     end
+   when "photo"
+     title = post["photo-caption"]
+     content = "\n"
+     unless post["photo-link-url"].nil?
+       content = "#{content}"
+     end
+   when "audio"
+     # post is a plain Hash, which has no #at method; index it directly.
+     player = post["audio-player"].to_s
+     if !post["id3-title"].nil?
+       title = post["id3-title"]
+       # to_s: a missing caption must not raise on concatenation.
+       content = player + "\n" + post["audio-caption"].to_s
+     else
+       title = post["audio-caption"]
+       content = player
+     end
+   when "quote"
+     title = post["quote-text"]
+     content = "#{post["quote-text"]}\n"
+     unless post["quote-source"].nil?
+       content << "—" + post["quote-source"]
+     end
+   when "conversation"
+     title = post["conversation-title"]
+     content = ""
+   when "video"
+     title = post["video-title"]
+     # Interpolate so a missing player yields "" and the feed's own
+     # string is not mutated by the append below.
+     content = "#{post["video-player"]}"
+     unless post["video-caption"].nil?
+       content << "\n" + post["video-caption"]
+     end
+   end
+   # Unknown post types and posts lacking a body leave content nil;
+   # normalise it so later content rewriting cannot raise.
+   content = content.to_s
+   date = Date.parse(post['date']).to_s
+   # to_s: untitled posts (nil title) must not raise on downcase.
+   slug = title.to_s.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
+   # Untitled posts would otherwise all share "<date>-.<format>" and
+   # overwrite each other. Presumably every Tumblr post carries an
+   # "id" -- TODO confirm; if it is absent the slug simply stays empty.
+   slug = post["id"].to_s if slug.empty?
+   {
+     :name => "#{date}-#{slug}.#{format}",
+     :header => {
+       "layout" => "post",
+       "title" => title,
+       "tags" => post["tags"],
+     },
+     :content => content,
+     :url => post["url"],
+     :slug => post["url-with-slug"],
+   }
+ end
+
+ # Builds a Hash of old Tumblr paths => new Jekyll urls, writes a
+ # redirect "index.html" under each old path, and replaces the old
+ # paths inside every post's content. A Jekyll site and posts are
+ # instantiated to get the correct permalink format.
+ #
+ # posts - Array of post hashes; :slug, :name and :content are read.
+ #
+ # Returns the posts, with each :content String rewritten in place.
+ def self.rewrite_urls_and_redirects(posts)
+   site = Jekyll::Site.new(Jekyll.configuration({}))
+   dir = File.join(File.dirname(__FILE__), "..")
+   urls = {}
+   posts.each do |post|
+     tumblr_url = URI.parse(post[:slug]).path
+     redirect_dir = tumblr_url.sub(/\//, "") + "/"
+     # An empty or root path would write "/index.html" at the filesystem
+     # root and yield a pattern that matches everywhere; skip it.
+     next if redirect_dir == "/"
+     jekyll_url = Jekyll::Post.new(site, dir, "", "tumblr/" + post[:name]).url
+     FileUtils.mkdir_p redirect_dir
+     File.open(redirect_dir + "index.html", "w") do |f|
+       # NOTE(review): only an empty line is written here; the redirect
+       # markup appears to have been lost -- confirm against upstream.
+       f.puts ""
+     end
+     urls[tumblr_url] = jekyll_url
+   end
+   # Compile each pattern once, escaped so that characters such as "."
+   # or "+" in a path are matched literally rather than as regexp syntax.
+   patterns = urls.map { |tumblr_url, jekyll_url|
+     [Regexp.new(Regexp.escape(tumblr_url), Regexp::IGNORECASE), jekyll_url]
+   }
+   posts.map { |post|
+     unless post[:content].nil?
+       patterns.each do |pattern, jekyll_url|
+         # Block form inserts the url verbatim (no backreference parsing).
+         post[:content].gsub!(pattern) { jekyll_url }
+       end
+     end
+     post
+   }
+ end
+
+ # Uses Python's html2text to convert a post's content to markdown,
+ # preserving HTML tables as per the markdown docs.
+ #
+ # content - String of HTML; it is left unmodified.
+ #
+ # Returns the String produced by the html2text command, with the
+ # table markup restored.
+ def self.html_to_markdown(content)
+   preserve = ["table", "tr", "th", "td"]
+   # Hide table tags from html2text behind "$$tag" / "||tag" markers.
+   masked = preserve.inject(content) do |html, tag|
+     html.gsub(/<#{tag}/i, "$$" + tag).gsub(/<\/#{tag}/i, "||" + tag)
+   end
+   # Feed the HTML to html2text on stdin rather than interpolating it
+   # into an `echo '...'` shell command: doubling quotes ('' for ') made
+   # the shell silently drop every apostrophe in the content.
+   markdown = IO.popen("html2text", "r+") do |io|
+     # Write from a thread so a full pipe cannot block the read below.
+     writer = Thread.new do
+       io.write masked
+       io.close_write
+     end
+     output = io.read
+     writer.join
+     output
+   end
+   # Undo the masking; closing tags must come back as "</tag" to be the
+   # inverse of the substitution above.
+   preserve.inject(markdown) do |text, tag|
+     text.gsub("$$" + tag, "<" + tag).gsub("||" + tag, "</" + tag)
+   end
+ end
+
+ # Wraps indented code blocks in pygments highlight tags for posts
+ # that use markdown format. No attempt is made to detect the language
+ # of a block, so adapt this to suit your own content. For example, my
+ # code blocks only contain Python and JavaScript, so a block is
+ # assumed to be JavaScript if a line ends with a semi-colon and
+ # Python otherwise.
+ #
+ # content - String of markdown.
+ #
+ # Returns the String with the highlight tags inserted.
+ def self.add_syntax_highlights(content)
+   rows = content.split("\n")
+   indent = /^ /
+   in_block = false
+   lang = nil
+   first = nil
+   rows.each_index do |idx|
+     row = rows[idx]
+     if in_block
+       lang = "javascript" if row =~ /;$/
+       # The block also ends on the very last line (EOF).
+       in_block = row =~ indent && idx < rows.size - 1
+       unless in_block
+         # The opening and closing tags replace the block's first line
+         # and the line just before the one that ended it.
+         rows[first] = "{% highlight #{lang} %}"
+         rows[idx - 1] = "{% endhighlight %}"
+       end
+       rows[idx] = rows[idx].sub(indent, "")
+     elsif row =~ indent
+       in_block = true
+       lang = "python"
+       first = idx
+     end
+   end
+   rows.join("\n")
+ end
+
def self.save_file(url, grab_image = false)
unless grab_image == false
FileUtils.mkdir_p "tumblr_files"
-
File.open("tumblr_files/#{url.split('/').last}", "w") do |f|
f.write(open(url).read)
end
-
return "/tumblr_files/#{url.split('/').last}"
else
return url