commit
d36a1064a7
|
@ -1,119 +1,195 @@
|
||||||
require 'rubygems'
|
require 'rubygems'
|
||||||
require 'nokogiri'
|
|
||||||
require 'open-uri'
|
require 'open-uri'
|
||||||
require 'fileutils'
|
require 'fileutils'
|
||||||
require 'CGI'
|
require 'nokogiri'
|
||||||
require 'iconv'
|
|
||||||
require 'date'
|
require 'date'
|
||||||
|
require 'json'
|
||||||
|
require 'uri'
|
||||||
|
require 'jekyll'
|
||||||
|
|
||||||
module Jekyll
|
module Jekyll
|
||||||
module Tumblr
|
module Tumblr
|
||||||
def self.process(url, grab_images = false)
|
def self.process(url, format = "html", grab_images = false,
|
||||||
current_page = 0
|
add_highlights = false, rewrite_urls = true)
|
||||||
|
@grab_images = grab_images
|
||||||
while true
|
FileUtils.mkdir_p "_posts/tumblr"
|
||||||
f = open(url + "/api/read?num=50&start=#{current_page * 50}")
|
url += "/api/read/json/"
|
||||||
doc = Nokogiri::HTML(Iconv.conv("utf-8", f.charset, f.readlines.join("\n")))
|
per_page = 50
|
||||||
|
posts = []
|
||||||
puts "Page: #{current_page + 1} - Posts: #{(doc/:tumblr/:posts/:post).size}"
|
# Two passes are required so that we can rewrite URLs.
|
||||||
|
# First pass builds up an array of each post as a hash.
|
||||||
FileUtils.mkdir_p "_posts/tumblr"
|
begin
|
||||||
|
current_page = (current_page || -1) + 1
|
||||||
(doc/:tumblr/:posts/:post).each do |post|
|
feed = open(url + "?num=#{per_page}&start=#{current_page * per_page}")
|
||||||
title = ""
|
json = feed.readlines.join("\n")[21...-2] # Strip Tumblr's JSONP chars.
|
||||||
content = nil
|
blog = JSON.parse(json)
|
||||||
name = nil
|
puts "Page: #{current_page + 1} - Posts: #{blog["posts"].size}"
|
||||||
|
posts += blog["posts"].map { |post| post_to_hash(post, format) }
|
||||||
if post['type'] == "regular"
|
end until blog["posts"].size < per_page
|
||||||
title_element = post.at("regular-title")
|
# Rewrite URLs and create redirects.
|
||||||
title = title_element.inner_text unless title_element == nil
|
posts = rewrite_urls_and_redirects posts if rewrite_urls
|
||||||
content = CGI::unescapeHTML post.at("regular-body").inner_html unless post.at("regular-body") == nil
|
# Second pass for writing post files.
|
||||||
elsif post['type'] == "link"
|
posts.each do |post|
|
||||||
title = post.at("link-text").inner_html unless post.at("link-text") == nil
|
if format == "md"
|
||||||
|
post[:content] = html_to_markdown post[:content]
|
||||||
if post.at("link-text") != nil
|
post[:content] = add_syntax_highlights post[:content] if add_highlights
|
||||||
content = "<a href=\"#{post.at("link-url").inner_html}\">#{post.at("link-text").inner_html}</a>"
|
|
||||||
else
|
|
||||||
content = "<a href=\"#{post.at("link-url").inner_html}\">#{post.at("link-url").inner_html}</a>"
|
|
||||||
end
|
|
||||||
|
|
||||||
content << "<br/>" + CGI::unescapeHTML(post.at("link-description").inner_html) unless post.at("link-description") == nil
|
|
||||||
elsif post['type'] == "photo"
|
|
||||||
content = ""
|
|
||||||
|
|
||||||
if post.at("photo-link-url") != nil
|
|
||||||
content = "<a href=\"#{post.at("photo-link-url").inner_html}\"><img src=\"#{save_file((post/"photo-url")[1].inner_html, grab_images)}\"/></a>"
|
|
||||||
else
|
|
||||||
content = "<img src=\"#{save_file((post/"photo-url")[1].inner_html, grab_images)}\"/>"
|
|
||||||
end
|
|
||||||
|
|
||||||
if post.at("photo-caption") != nil
|
|
||||||
content << "<br/>" unless content == nil
|
|
||||||
content << CGI::unescapeHTML(post.at("photo-caption").inner_html)
|
|
||||||
end
|
|
||||||
elsif post['type'] == "audio"
|
|
||||||
content = CGI::unescapeHTML(post.at("audio-player").inner_html)
|
|
||||||
content << CGI::unescapeHTML(post.at("audio-caption").inner_html) unless post.at("audio-caption") == nil
|
|
||||||
elsif post['type'] == "quote"
|
|
||||||
content = "<blockquote>" + CGI::unescapeHTML(post.at("quote-text").inner_html) + "</blockquote>"
|
|
||||||
content << "—" + CGI::unescapeHTML(post.at("quote-source").inner_html) unless post.at("quote-source") == nil
|
|
||||||
elsif post['type'] == "conversation"
|
|
||||||
title = post.at("conversation-title").inner_html unless post.at("conversation-title") == nil
|
|
||||||
content = "<section><dialog>"
|
|
||||||
|
|
||||||
(post/:conversation/:line).each do |line|
|
|
||||||
content << "<dt>" + line['label'] + "</dt><dd>" + line.inner_html + "</dd>" unless line['label'] == nil || line == nil
|
|
||||||
end
|
|
||||||
|
|
||||||
content << "</section></dialog>"
|
|
||||||
elsif post['type'] == "video"
|
|
||||||
title = post.at("video-title").inner_html unless post.at("video-title") == nil
|
|
||||||
content = CGI::unescapeHTML(post.at("video-player").inner_html)
|
|
||||||
content << CGI::unescapeHTML(post.at("video-caption").inner_html) unless post.at("video-caption") == nil
|
|
||||||
end # End post types
|
|
||||||
|
|
||||||
name = "#{Date.parse(post['date']).to_s}-#{post['id'].downcase.gsub(/[^a-z0-9]/, '-')}.html"
|
|
||||||
|
|
||||||
if title != nil || content != nil && name != nil
|
|
||||||
File.open("_posts/tumblr/#{name}", "w") do |f|
|
|
||||||
|
|
||||||
f.puts <<-HEADER
|
|
||||||
---
|
|
||||||
layout: post
|
|
||||||
title: #{title}
|
|
||||||
---
|
|
||||||
|
|
||||||
HEADER
|
|
||||||
|
|
||||||
f.puts content
|
|
||||||
end # End file
|
|
||||||
end
|
|
||||||
|
|
||||||
end # End post XML
|
|
||||||
|
|
||||||
if (doc/:tumblr/:posts/:post).size < 50
|
|
||||||
break
|
|
||||||
else
|
|
||||||
current_page = current_page + 1
|
|
||||||
end
|
end
|
||||||
|
File.open("_posts/tumblr/#{post[:name]}", "w") do |f|
|
||||||
end # End while loop
|
f.puts post[:header].to_yaml + "---\n" + post[:content]
|
||||||
end # End method
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
private
|
private
|
||||||
|
|
||||||
def self.save_file(url, grab_image = false)
|
# Converts each type of Tumblr post to a hash with all required
|
||||||
unless grab_image == false
|
# data for Jekyll.
|
||||||
FileUtils.mkdir_p "tumblr_files"
|
def self.post_to_hash(post, format)
|
||||||
|
case post['type']
|
||||||
File.open("tumblr_files/#{url.split('/').last}", "w") do |f|
|
when "regular"
|
||||||
f.write(open(url).read)
|
title = post["regular-title"]
|
||||||
end
|
content = post["regular-body"]
|
||||||
|
when "link"
|
||||||
return "/tumblr_files/#{url.split('/').last}"
|
title = post["link-text"] || post["link-url"]
|
||||||
else
|
content = "<a href=\"#{post["link-url"]}\">#{title}</a>"
|
||||||
return url
|
unless post["link-description"].nil?
|
||||||
|
content << "<br/>" + post["link-description"]
|
||||||
|
end
|
||||||
|
when "photo"
|
||||||
|
title = post["photo-caption"]
|
||||||
|
max_size = post.keys.map{ |k| k.gsub("photo-url-", "").to_i }.max
|
||||||
|
url = post["photo-url"] || post["photo-url-#{max_size}"]
|
||||||
|
ext = "." + post[post.keys.select { |k|
|
||||||
|
k =~ /^photo-url-/ && post[k].split("/").last =~ /\./
|
||||||
|
}.first].split(".").last
|
||||||
|
content = "<img src=\"#{save_file(url, ext)}\"/>"
|
||||||
|
unless post["photo-link-url"].nil?
|
||||||
|
content = "<a href=\"#{post["photo-link-url"]}\">#{content}</a>"
|
||||||
|
end
|
||||||
|
when "audio"
|
||||||
|
if !post["id3-title"].nil?
|
||||||
|
title = post["id3-title"]
|
||||||
|
content = post.at["audio-player"] + "<br/>" + post["audio-caption"]
|
||||||
|
else
|
||||||
|
title = post["audio-caption"]
|
||||||
|
content = post.at["audio-player"]
|
||||||
|
end
|
||||||
|
when "quote"
|
||||||
|
title = post["quote-text"]
|
||||||
|
content = "<blockquote>#{post["quote-text"]}</blockquote>"
|
||||||
|
unless post["quote-source"].nil?
|
||||||
|
content << "—" + post["quote-source"]
|
||||||
|
end
|
||||||
|
when "conversation"
|
||||||
|
title = post["conversation-title"]
|
||||||
|
content = "<section><dialog>"
|
||||||
|
post["conversation"]["line"].each do |line|
|
||||||
|
content << "<dt>#{line['label']}</dt><dd>#{line}</dd>"
|
||||||
|
end
|
||||||
|
content << "</section></dialog>"
|
||||||
|
when "video"
|
||||||
|
title = post["video-title"]
|
||||||
|
content = post["video-player"]
|
||||||
|
unless post["video-caption"].nil?
|
||||||
|
content << "<br/>" + post["video-caption"]
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
date = Date.parse(post['date']).to_s
|
||||||
|
title = Nokogiri::HTML(title).text
|
||||||
|
slug = title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
|
||||||
|
{
|
||||||
|
:name => "#{date}-#{slug}.#{format}",
|
||||||
|
:header => {
|
||||||
|
"layout" => "post",
|
||||||
|
"title" => title,
|
||||||
|
"tags" => post["tags"],
|
||||||
|
},
|
||||||
|
:content => content,
|
||||||
|
:url => post["url"],
|
||||||
|
:slug => post["url-with-slug"],
|
||||||
|
}
|
||||||
|
end
|
||||||
|
|
||||||
|
# Create a Hash of old urls => new urls, for rewriting and
|
||||||
|
# redirects, and replace urls in each post. Instantiate Jekyll
|
||||||
|
# site/posts to get the correct permalink format.
|
||||||
|
def self.rewrite_urls_and_redirects(posts)
|
||||||
|
site = Jekyll::Site.new(Jekyll.configuration({}))
|
||||||
|
dir = File.join(File.dirname(__FILE__), "..")
|
||||||
|
urls = Hash[posts.map { |post|
|
||||||
|
# Create an initial empty file for the post so that
|
||||||
|
# we can instantiate a post object.
|
||||||
|
File.open("_posts/tumblr/#{post[:name]}", "w")
|
||||||
|
tumblr_url = URI.parse(post[:slug]).path
|
||||||
|
jekyll_url = Jekyll::Post.new(site, dir, "", "tumblr/" + post[:name]).url
|
||||||
|
redirect_dir = tumblr_url.sub(/\//, "") + "/"
|
||||||
|
FileUtils.mkdir_p redirect_dir
|
||||||
|
File.open(redirect_dir + "index.html", "w") do |f|
|
||||||
|
f.puts "<html><head><meta http-equiv='Refresh' content='0; " +
|
||||||
|
"url=#{jekyll_url}'></head><body></body></html>"
|
||||||
|
end
|
||||||
|
[tumblr_url, jekyll_url]
|
||||||
|
}]
|
||||||
|
posts.map { |post|
|
||||||
|
urls.each do |tumblr_url, jekyll_url|
|
||||||
|
post[:content].gsub!(/#{tumblr_url}/i, jekyll_url)
|
||||||
|
end
|
||||||
|
post
|
||||||
|
}
|
||||||
|
end
|
||||||
|
|
||||||
|
# Uses Python's html2text to convert a post's content to
|
||||||
|
# markdown. Preserve HTML tables as per the markdown docs.
|
||||||
|
def self.html_to_markdown(content)
|
||||||
|
preserve = ["table", "tr", "th", "td"]
|
||||||
|
preserve.each do |tag|
|
||||||
|
content.gsub!(/<#{tag}/i, "$$" + tag)
|
||||||
|
content.gsub!(/<\/#{tag}/i, "||" + tag)
|
||||||
|
end
|
||||||
|
content = %x[echo '#{content.gsub("'", "''")}' | html2text]
|
||||||
|
preserve.each do |tag|
|
||||||
|
content.gsub!("$$" + tag, "<" + tag)
|
||||||
|
content.gsub!("||" + tag, "</" + tag)
|
||||||
|
end
|
||||||
|
content
|
||||||
|
end
|
||||||
|
|
||||||
|
# Adds pygments highlight tags to code blocks in posts that use
|
||||||
|
# markdown format. This doesn't guess the language of the code
|
||||||
|
# block, so you should modify this to suit your own content.
|
||||||
|
# For example, my code block only contain Python and JavaScript,
|
||||||
|
# so I can assume the block is JavaScript if it contains a
|
||||||
|
# semi-colon.
|
||||||
|
def self.add_syntax_highlights(content)
|
||||||
|
lines = content.split("\n")
|
||||||
|
block, indent, lang, start = false, /^ /, nil, nil
|
||||||
|
lines.each_with_index do |line, i|
|
||||||
|
if !block && line =~ indent
|
||||||
|
block = true
|
||||||
|
lang = "python"
|
||||||
|
start = i
|
||||||
|
elsif block
|
||||||
|
lang = "javascript" if line =~ /;$/
|
||||||
|
block = line =~ indent && i < lines.size - 1 # Also handle EOF
|
||||||
|
if !block
|
||||||
|
lines[start] = "{% highlight #{lang} %}"
|
||||||
|
lines[i - 1] = "{% endhighlight %}"
|
||||||
|
end
|
||||||
|
lines[i] = lines[i].sub(indent, "")
|
||||||
|
end
|
||||||
|
end
|
||||||
|
lines.join("\n")
|
||||||
|
end
|
||||||
|
|
||||||
|
def self.save_file(url, ext)
|
||||||
|
if @grab_images
|
||||||
|
path = "tumblr_files/#{url.split('/').last}"
|
||||||
|
path += ext unless path =~ /#{ext}$/
|
||||||
|
FileUtils.mkdir_p "tumblr_files"
|
||||||
|
File.open(path, "w") { |f| f.write(open(url).read) }
|
||||||
|
url = "/" + path
|
||||||
|
end
|
||||||
|
url
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue