diff --git a/Rakefile b/Rakefile index d1917c5a..ff52d677 100644 --- a/Rakefile +++ b/Rakefile @@ -82,6 +82,21 @@ end # ############################################################################# +namespace :migrate do + desc "Migrate from mephisto in the current directory" + task :mephisto do + sh %q(ruby -r './lib/jekyll/migrators/mephisto' -e 'Jekyll::Mephisto.postgres(:database => "#{ENV["DB"]}")') + end + desc "Migrate from Movable Type in the current directory" + task :mt do + sh %q(ruby -r './lib/jekyll/migrators/mt' -e 'Jekyll::MT.process("#{ENV["DB"]}", "#{ENV["USER"]}", "#{ENV["PASS"]}")') + end + desc "Migrate from Typo in the current directory" + task :typo do + sh %q(ruby -r './lib/jekyll/migrators/typo' -e 'Jekyll::Typo.process("#{ENV["DB"]}", "#{ENV["USER"]}", "#{ENV["PASS"]}")') + end +end + begin require 'cucumber/rake/task' Cucumber::Rake::Task.new(:features) do |t| diff --git a/lib/jekyll/migrators/csv.rb b/lib/jekyll/migrators/csv.rb new file mode 100644 index 00000000..ce5203b7 --- /dev/null +++ b/lib/jekyll/migrators/csv.rb @@ -0,0 +1,26 @@ +module Jekyll + module CSV + # Reads a csv with title, permalink, body, published_at, and filter. + # It creates a post file for each row in the csv + def self.process(file = "posts.csv") + FileUtils.mkdir_p "_posts" + posts = 0 + FasterCSV.foreach(file) do |row| + next if row[0] == "title" + posts += 1 + name = row[3].split(" ")[0]+"-"+row[1]+(row[4] =~ /markdown/ ? ".markdown" : ".textile") + File.open("_posts/#{name}", "w") do |f| + f.puts <<-HEADER +--- +layout: post +title: #{row[0]} +--- + + HEADER + f.puts row[2] + end + end + "Created #{posts} posts!" + end + end +end diff --git a/lib/jekyll/migrators/drupal.rb b/lib/jekyll/migrators/drupal.rb new file mode 100644 index 00000000..7fd16aef --- /dev/null +++ b/lib/jekyll/migrators/drupal.rb @@ -0,0 +1,103 @@ +require 'rubygems' +require 'sequel' +require 'fileutils' +require 'yaml' + +# NOTE: This converter requires Sequel and the MySQL gems. +# The MySQL gem can be difficult to install on OS X. Once you have MySQL +# installed, running the following commands should work: +# $ sudo gem install sequel +# $ sudo gem install mysql -- --with-mysql-config=/usr/local/mysql/bin/mysql_config + +module Jekyll + module Drupal + # Reads a MySQL database via Sequel and creates a post file for each post + # in wp_posts that has post_status = 'publish'. This restriction is made + # because 'draft' posts are not guaranteed to have valid dates. + QUERY = "SELECT n.nid, \ + n.title, \ + nr.body, \ + n.created, \ + n.status \ + FROM node AS n, \ + node_revisions AS nr \ + WHERE (n.type = 'blog' OR n.type = 'story') \ + AND n.vid = nr.vid" + + def self.process(dbname, user, pass, host = 'localhost', prefix = '') + db = Sequel.mysql(dbname, :user => user, :password => pass, :host => host, :encoding => 'utf8') + + if prefix != '' + QUERY[" node "] = " " + prefix + "node " + QUERY[" node_revisions "] = " " + prefix + "node_revisions " + end + + FileUtils.mkdir_p "_posts" + FileUtils.mkdir_p "_drafts" + + # Create the refresh layout + # Change the refresh url if you customized your permalink config + File.open("_layouts/refresh.html", "w") do |f| + f.puts < + + + + + + +EOF + end + + db[QUERY].each do |post| + # Get required fields and construct Jekyll compatible name + node_id = post[:nid] + title = post[:title] + content = post[:body] + created = post[:created] + time = Time.at(created) + is_published = post[:status] == 1 + dir = is_published ? "_posts" : "_drafts" + slug = title.strip.downcase.gsub(/(&|&)/, ' and ').gsub(/[\s\.\/\\]/, '-').gsub(/[^\w-]/, '').gsub(/[-_]{2,}/, '-').gsub(/^[-_]/, '').gsub(/[-_]$/, '') + name = time.strftime("%Y-%m-%d-") + slug + '.md' + + # Get the relevant fields as a hash, delete empty fields and convert + # to YAML for the header + data = { + 'layout' => 'post', + 'title' => title.to_s, + 'created' => created, + }.delete_if { |k,v| v.nil? || v == ''}.to_yaml + + # Write out the data and content to file + File.open("#{dir}/#{name}", "w") do |f| + f.puts data + f.puts "---" + f.puts content + end + + # Make a file to redirect from the old Drupal URL + if is_published + aliases = db["SELECT dst FROM #{prefix}url_alias WHERE src = ?", "node/#{node_id}"].all + + aliases.push(:dst => "node/#{node_id}") + + aliases.each do |url_alias| + FileUtils.mkdir_p url_alias[:dst] + File.open("#{url_alias[:dst]}/index.md", "w") do |f| + f.puts "---" + f.puts "layout: refresh" + f.puts "refresh_to_post_id: /#{time.strftime("%Y/%m/%d/") + slug}" + f.puts "---" + end + end + end + end + + # TODO: Make dirs & files for nodes of type 'page' + # Make refresh pages for these as well + + # TODO: Make refresh dirs & files according to entries in url_alias table + end + end +end diff --git a/lib/jekyll/migrators/enki.rb b/lib/jekyll/migrators/enki.rb new file mode 100644 index 00000000..61cb2562 --- /dev/null +++ b/lib/jekyll/migrators/enki.rb @@ -0,0 +1,49 @@ +# Adapted by Rodrigo Pinto +# Based on typo.rb by Toby DiPasquale + +require 'fileutils' +require 'rubygems' +require 'sequel' + +module Jekyll + module Enki + SQL = <<-EOS + SELECT p.id, + p.title, + p.slug, + p.body, + p.published_at as date, + p.cached_tag_list as tags + FROM posts p + EOS + + # Just working with postgres, but can be easily adapted + # to work with both mysql and postgres. + def self.process(dbname, user, pass, host = 'localhost') + FileUtils.mkdir_p('_posts') + db = Sequel.postgres(:database => dbname, + :user => user, + :password => pass, + :host => host, + :encoding => 'utf8') + + db[SQL].each do |post| + name = [ sprintf("%.04d", post[:date].year), + sprintf("%.02d", post[:date].month), + sprintf("%.02d", post[:date].day), + post[:slug].strip ].join('-') + name += '.textile' + + File.open("_posts/#{name}", 'w') do |f| + f.puts({ 'layout' => 'post', + 'title' => post[:title].to_s, + 'enki_id' => post[:id], + 'categories' => post[:tags] + }.delete_if { |k, v| v.nil? || v == '' }.to_yaml) + f.puts '---' + f.puts post[:body].delete("\r") + end + end + end + end +end diff --git a/lib/jekyll/migrators/joomla.rb b/lib/jekyll/migrators/joomla.rb new file mode 100644 index 00000000..87f1e105 --- /dev/null +++ b/lib/jekyll/migrators/joomla.rb @@ -0,0 +1,53 @@ +require 'rubygems' +require 'sequel' +require 'fileutils' +require 'yaml' + +# NOTE: This migrator is made for Joomla 1.5 databases. +# NOTE: This converter requires Sequel and the MySQL gems. +# The MySQL gem can be difficult to install on OS X. Once you have MySQL +# installed, running the following commands should work: +# $ sudo gem install sequel +# $ sudo gem install mysql -- --with-mysql-config=/usr/local/mysql/bin/mysql_config + +module Jekyll + module Joomla + def self.process(dbname, user, pass, host = 'localhost', table_prefix = 'jos_', section = '1') + db = Sequel.mysql(dbname, :user => user, :password => pass, :host => host, :encoding => 'utf8') + + FileUtils.mkdir_p("_posts") + + # Reads a MySQL database via Sequel and creates a post file for each + # post in wp_posts that has post_status = 'publish'. This restriction is + # made because 'draft' posts are not guaranteed to have valid dates. + query = "SELECT `title`, `alias`, CONCAT(`introtext`,`fulltext`) as content, `created`, `id` FROM #{table_prefix}content WHERE state = '0' OR state = '1' AND sectionid = '#{section}'" + + db[query].each do |post| + # Get required fields and construct Jekyll compatible name. + title = post[:title] + slug = post[:alias] + date = post[:created] + content = post[:content] + name = "%02d-%02d-%02d-%s.markdown" % [date.year, date.month, date.day, + slug] + + # Get the relevant fields as a hash, delete empty fields and convert + # to YAML for the header. + data = { + 'layout' => 'post', + 'title' => title.to_s, + 'joomla_id' => post[:id], + 'joomla_url' => post[:alias], + 'date' => date + }.delete_if { |k,v| v.nil? || v == '' }.to_yaml + + # Write out the data and content to file + File.open("_posts/#{name}", "w") do |f| + f.puts data + f.puts "---" + f.puts content + end + end + end + end +end diff --git a/lib/jekyll/migrators/marley.rb b/lib/jekyll/migrators/marley.rb new file mode 100644 index 00000000..21bcead5 --- /dev/null +++ b/lib/jekyll/migrators/marley.rb @@ -0,0 +1,52 @@ +require 'yaml' +require 'fileutils' + +module Jekyll + module Marley + def self.regexp + { :id => /^\d{0,4}-{0,1}(.*)$/, + :title => /^#\s*(.*)\s+$/, + :title_with_date => /^#\s*(.*)\s+\(([0-9\/]+)\)$/, + :published_on => /.*\s+\(([0-9\/]+)\)$/, + :perex => /^([^\#\n]+\n)$/, + :meta => /^\{\{\n(.*)\}\}\n$/mi # Multiline Regexp + } + end + + def self.process(marley_data_dir) + raise ArgumentError, "marley dir #{marley_data_dir} not found" unless File.directory?(marley_data_dir) + + FileUtils.mkdir_p "_posts" + + posts = 0 + Dir["#{marley_data_dir}/**/*.txt"].each do |f| + next unless File.exists?(f) + + #copied over from marley's app/lib/post.rb + file_content = File.read(f) + meta_content = file_content.slice!( self.regexp[:meta] ) + body = file_content.sub( self.regexp[:title], '').sub( self.regexp[:perex], '').strip + + title = file_content.scan( self.regexp[:title] ).first.to_s.strip + prerex = file_content.scan( self.regexp[:perex] ).first.to_s.strip + published_on = DateTime.parse( post[:published_on] ) rescue File.mtime( File.dirname(f) ) + meta = ( meta_content ) ? YAML::load( meta_content.scan( self.regexp[:meta]).to_s ) : {} + meta['title'] = title + meta['layout'] = 'post' + + formatted_date = published_on.strftime('%Y-%m-%d') + post_name = File.dirname(f).split(%r{/}).last.gsub(/\A\d+-/, '') + + name = "#{formatted_date}-#{post_name}" + File.open("_posts/#{name}.markdown", "w") do |f| + f.puts meta.to_yaml + f.puts "---\n" + f.puts "\n#{prerex}\n\n" if prerex + f.puts body + end + posts += 1 + end + "Created #{posts} posts!" + end + end +end diff --git a/lib/jekyll/migrators/mephisto.rb b/lib/jekyll/migrators/mephisto.rb new file mode 100644 index 00000000..7622c722 --- /dev/null +++ b/lib/jekyll/migrators/mephisto.rb @@ -0,0 +1,84 @@ +# Quickly hacked together my Michael Ivey +# Based on mt.rb by Nick Gerakines, open source and publically +# available under the MIT license. Use this module at your own risk. + +require 'rubygems' +require 'sequel' +require 'fastercsv' +require 'fileutils' +require File.join(File.dirname(__FILE__),"csv.rb") + +# NOTE: This converter requires Sequel and the MySQL gems. +# The MySQL gem can be difficult to install on OS X. Once you have MySQL +# installed, running the following commands should work: +# $ sudo gem install sequel +# $ sudo gem install mysql -- --with-mysql-config=/usr/local/mysql/bin/mysql_config + +module Jekyll + module Mephisto + #Accepts a hash with database config variables, exports mephisto posts into a csv + #export PGPASSWORD if you must + def self.postgres(c) + sql = <<-SQL + BEGIN; + CREATE TEMP TABLE jekyll AS + SELECT title, permalink, body, published_at, filter FROM contents + WHERE user_id = 1 AND type = 'Article' ORDER BY published_at; + COPY jekyll TO STDOUT WITH CSV HEADER; + ROLLBACK; + SQL + command = %Q(psql -h #{c[:host] || "localhost"} -c "#{sql.strip}" #{c[:database]} #{c[:username]} -o #{c[:filename] || "posts.csv"}) + puts command + `#{command}` + CSV.process + end + + # This query will pull blog posts from all entries across all blogs. If + # you've got unpublished, deleted or otherwise hidden posts please sift + # through the created posts to make sure nothing is accidently published. + QUERY = "SELECT id, \ + permalink, \ + body, \ + published_at, \ + title \ + FROM contents \ + WHERE user_id = 1 AND \ + type = 'Article' AND \ + published_at IS NOT NULL \ + ORDER BY published_at" + + def self.process(dbname, user, pass, host = 'localhost') + db = Sequel.mysql(dbname, :user => user, + :password => pass, + :host => host, + :encoding => 'utf8') + + FileUtils.mkdir_p "_posts" + + db[QUERY].each do |post| + title = post[:title] + slug = post[:permalink] + date = post[:published_at] + content = post[:body] + + # Ideally, this script would determine the post format (markdown, + # html, etc) and create files with proper extensions. At this point + # it just assumes that markdown will be acceptable. + name = [date.year, date.month, date.day, slug].join('-') + ".markdown" + + data = { + 'layout' => 'post', + 'title' => title.to_s, + 'mt_id' => post[:entry_id], + }.delete_if { |k,v| v.nil? || v == ''}.to_yaml + + File.open("_posts/#{name}", "w") do |f| + f.puts data + f.puts "---" + f.puts content + end + end + + end + end +end diff --git a/lib/jekyll/migrators/mt.rb b/lib/jekyll/migrators/mt.rb new file mode 100644 index 00000000..048c84db --- /dev/null +++ b/lib/jekyll/migrators/mt.rb @@ -0,0 +1,86 @@ +# Created by Nick Gerakines, open source and publically available under the +# MIT license. Use this module at your own risk. +# I'm an Erlang/Perl/C++ guy so please forgive my dirty ruby. + +require 'rubygems' +require 'sequel' +require 'fileutils' +require 'yaml' + +# NOTE: This converter requires Sequel and the MySQL gems. +# The MySQL gem can be difficult to install on OS X. Once you have MySQL +# installed, running the following commands should work: +# $ sudo gem install sequel +# $ sudo gem install mysql -- --with-mysql-config=/usr/local/mysql/bin/mysql_config + +module Jekyll + module MT + # This query will pull blog posts from all entries across all blogs. If + # you've got unpublished, deleted or otherwise hidden posts please sift + # through the created posts to make sure nothing is accidently published. + QUERY = "SELECT entry_id, \ + entry_basename, \ + entry_text, \ + entry_text_more, \ + entry_authored_on, \ + entry_title, \ + entry_convert_breaks \ + FROM mt_entry" + + def self.process(dbname, user, pass, host = 'localhost') + db = Sequel.mysql(dbname, :user => user, :password => pass, :host => host, :encoding => 'utf8') + + FileUtils.mkdir_p "_posts" + + db[QUERY].each do |post| + title = post[:entry_title] + slug = post[:entry_basename].gsub(/_/, '-') + date = post[:entry_authored_on] + content = post[:entry_text] + more_content = post[:entry_text_more] + entry_convert_breaks = post[:entry_convert_breaks] + + # Be sure to include the body and extended body. + if more_content != nil + content = content + " \n" + more_content + end + + # Ideally, this script would determine the post format (markdown, + # html, etc) and create files with proper extensions. At this point + # it just assumes that markdown will be acceptable. + name = [date.year, date.month, date.day, slug].join('-') + '.' + + self.suffix(entry_convert_breaks) + + data = { + 'layout' => 'post', + 'title' => title.to_s, + 'mt_id' => post[:entry_id], + 'date' => date + }.delete_if { |k,v| v.nil? || v == '' }.to_yaml + + File.open("_posts/#{name}", "w") do |f| + f.puts data + f.puts "---" + f.puts content + end + end + end + + def self.suffix(entry_type) + if entry_type.nil? || entry_type.include?("markdown") + # The markdown plugin I have saves this as + # "markdown_with_smarty_pants", so I just look for "markdown". + "markdown" + elsif entry_type.include?("textile") + # This is saved as "textile_2" on my installation of MT 5.1. + "textile" + elsif entry_type == "0" || entry_type.include?("richtext") + # Richtext looks to me like it's saved as HTML, so I include it here. + "html" + else + # Other values might need custom work. + entry_type + end + end + end +end diff --git a/lib/jekyll/migrators/posterous.rb b/lib/jekyll/migrators/posterous.rb new file mode 100644 index 00000000..0a2280f2 --- /dev/null +++ b/lib/jekyll/migrators/posterous.rb @@ -0,0 +1,67 @@ +require 'rubygems' +require 'jekyll' +require 'fileutils' +require 'net/http' +require 'uri' +require "json" + +# ruby -r './lib/jekyll/migrators/posterous.rb' -e 'Jekyll::Posterous.process(email, pass, api_key, blog)' + +module Jekyll + module Posterous + def self.fetch(uri_str, limit = 10) + # You should choose better exception. + raise ArgumentError, 'Stuck in a redirect loop. Please double check your email and password' if limit == 0 + + response = nil + Net::HTTP.start('posterous.com') do |http| + req = Net::HTTP::Get.new(uri_str) + req.basic_auth @email, @pass + response = http.request(req) + end + + case response + when Net::HTTPSuccess then response + when Net::HTTPRedirection then fetch(response['location'], limit - 1) + else response.error! + end + end + + def self.process(email, pass, api_token, blog = 'primary') + @email, @pass, @api_token = email, pass, api_token + FileUtils.mkdir_p "_posts" + + posts = JSON.parse(self.fetch("/api/v2/users/me/sites/#{blog}/posts?api_token=#{@api_token}").body) + page = 1 + + while posts.any? + posts.each do |post| + title = post["title"] + slug = title.gsub(/[^[:alnum:]]+/, '-').downcase + date = Date.parse(post["display_date"]) + content = post["body_html"] + published = !post["is_private"] + name = "%02d-%02d-%02d-%s.html" % [date.year, date.month, date.day, slug] + + # Get the relevant fields as a hash, delete empty fields and convert + # to YAML for the header + data = { + 'layout' => 'post', + 'title' => title.to_s, + 'published' => published + }.delete_if { |k,v| v.nil? || v == ''}.to_yaml + + # Write out the data and content to file + File.open("_posts/#{name}", "w") do |f| + f.puts data + f.puts "---" + f.puts content + end + end + + page += 1 + posts = JSON.parse(self.fetch("/api/v2/users/me/sites/#{blog}/posts?api_token=#{@api_token}&page=#{page}").body) + end + end + end +end diff --git a/lib/jekyll/migrators/rss.rb b/lib/jekyll/migrators/rss.rb new file mode 100644 index 00000000..461abd35 --- /dev/null +++ b/lib/jekyll/migrators/rss.rb @@ -0,0 +1,47 @@ +# Created by Kendall Buchanan (https://github.com/kendagriff) on 2011-12-22. +# Use at your own risk. The end. +# +# Usage: +# (URL) +# ruby -r '_import/rss.rb' -e "Jekyll::MigrateRSS.process('http://yourdomain.com/your-favorite-feed.xml')" +# +# (Local file) +# ruby -r '_import/rss.rb' -e "Jekyll::MigrateRSS.process('./somefile/on/your/computer.xml')" + +require 'rubygems' +require 'rss/1.0' +require 'rss/2.0' +require 'open-uri' +require 'fileutils' +require 'yaml' + +module Jekyll + module MigrateRSS + + # The `source` argument may be a URL or a local file. + def self.process(source) + content = "" + open(source) { |s| content = s.read } + rss = RSS::Parser.parse(content, false) + + raise "There doesn't appear to be any RSS items at the source (#{source}) provided." unless rss + + rss.items.each do |item| + formatted_date = item.date.strftime('%Y-%m-%d') + post_name = item.title.split(%r{ |!|/|:|&|-|$|,}).map { |i| i.downcase if i != '' }.compact.join('-') + name = "#{formatted_date}-#{post_name}" + + header = { + 'layout' => 'post', + 'title' => item.title + } + + File.open("_posts/#{name}.html", "w") do |f| + f.puts header.to_yaml + f.puts "---\n" + f.puts item.description + end + end + end + end +end \ No newline at end of file diff --git a/lib/jekyll/migrators/textpattern.rb b/lib/jekyll/migrators/textpattern.rb new file mode 100644 index 00000000..3b370ed9 --- /dev/null +++ b/lib/jekyll/migrators/textpattern.rb @@ -0,0 +1,58 @@ +require 'rubygems' +require 'sequel' +require 'fileutils' +require 'yaml' + +# NOTE: This converter requires Sequel and the MySQL gems. +# The MySQL gem can be difficult to install on OS X. Once you have MySQL +# installed, running the following commands should work: +# $ sudo gem install sequel +# $ sudo gem install mysql -- --with-mysql-config=/usr/local/mysql/bin/mysql_config + +module Jekyll + module TextPattern + # Reads a MySQL database via Sequel and creates a post file for each post. + # The only posts selected are those with a status of 4 or 5, which means + # "live" and "sticky" respectively. + # Other statuses are 1 => draft, 2 => hidden and 3 => pending. + QUERY = "SELECT Title, \ + url_title, \ + Posted, \ + Body, \ + Keywords \ + FROM textpattern \ + WHERE Status = '4' OR \ + Status = '5'" + + def self.process(dbname, user, pass, host = 'localhost') + db = Sequel.mysql(dbname, :user => user, :password => pass, :host => host, :encoding => 'utf8') + + FileUtils.mkdir_p "_posts" + + db[QUERY].each do |post| + # Get required fields and construct Jekyll compatible name. + title = post[:Title] + slug = post[:url_title] + date = post[:Posted] + content = post[:Body] + + name = [date.strftime("%Y-%m-%d"), slug].join('-') + ".textile" + + # Get the relevant fields as a hash, delete empty fields and convert + # to YAML for the header. + data = { + 'layout' => 'post', + 'title' => title.to_s, + 'tags' => post[:Keywords].split(',') + }.delete_if { |k,v| v.nil? || v == ''}.to_yaml + + # Write out the data and content to file. + File.open("_posts/#{name}", "w") do |f| + f.puts data + f.puts "---" + f.puts content + end + end + end + end +end diff --git a/lib/jekyll/migrators/tumblr.rb b/lib/jekyll/migrators/tumblr.rb new file mode 100644 index 00000000..367a83c9 --- /dev/null +++ b/lib/jekyll/migrators/tumblr.rb @@ -0,0 +1,195 @@ +require 'rubygems' +require 'open-uri' +require 'fileutils' +require 'nokogiri' +require 'date' +require 'json' +require 'uri' +require 'jekyll' + +module Jekyll + module Tumblr + def self.process(url, format = "html", grab_images = false, + add_highlights = false, rewrite_urls = true) + @grab_images = grab_images + FileUtils.mkdir_p "_posts/tumblr" + url += "/api/read/json/" + per_page = 50 + posts = [] + # Two passes are required so that we can rewrite URLs. + # First pass builds up an array of each post as a hash. + begin + current_page = (current_page || -1) + 1 + feed = open(url + "?num=#{per_page}&start=#{current_page * per_page}") + json = feed.readlines.join("\n")[21...-2] # Strip Tumblr's JSONP chars. + blog = JSON.parse(json) + puts "Page: #{current_page + 1} - Posts: #{blog["posts"].size}" + posts += blog["posts"].map { |post| post_to_hash(post, format) } + end until blog["posts"].size < per_page + # Rewrite URLs and create redirects. + posts = rewrite_urls_and_redirects posts if rewrite_urls + # Second pass for writing post files. + posts.each do |post| + if format == "md" + post[:content] = html_to_markdown post[:content] + post[:content] = add_syntax_highlights post[:content] if add_highlights + end + File.open("_posts/tumblr/#{post[:name]}", "w") do |f| + f.puts post[:header].to_yaml + "---\n" + post[:content] + end + end + end + + private + + # Converts each type of Tumblr post to a hash with all required + # data for Jekyll. + def self.post_to_hash(post, format) + case post['type'] + when "regular" + title = post["regular-title"] + content = post["regular-body"] + when "link" + title = post["link-text"] || post["link-url"] + content = "#{title}" + unless post["link-description"].nil? + content << "
" + post["link-description"] + end + when "photo" + title = post["photo-caption"] + max_size = post.keys.map{ |k| k.gsub("photo-url-", "").to_i }.max + url = post["photo-url"] || post["photo-url-#{max_size}"] + ext = "." + post[post.keys.select { |k| + k =~ /^photo-url-/ && post[k].split("/").last =~ /\./ + }.first].split(".").last + content = "" + unless post["photo-link-url"].nil? + content = "#{content}" + end + when "audio" + if !post["id3-title"].nil? + title = post["id3-title"] + content = post.at["audio-player"] + "
" + post["audio-caption"] + else + title = post["audio-caption"] + content = post.at["audio-player"] + end + when "quote" + title = post["quote-text"] + content = "
#{post["quote-text"]}
" + unless post["quote-source"].nil? + content << "—" + post["quote-source"] + end + when "conversation" + title = post["conversation-title"] + content = "
" + post["conversation"]["line"].each do |line| + content << "
#{line['label']}
#{line}
" + end + content << "
" + when "video" + title = post["video-title"] + content = post["video-player"] + unless post["video-caption"].nil? + content << "
" + post["video-caption"] + end + end + date = Date.parse(post['date']).to_s + title = Nokogiri::HTML(title).text + slug = title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '') + { + :name => "#{date}-#{slug}.#{format}", + :header => { + "layout" => "post", + "title" => title, + "tags" => post["tags"], + }, + :content => content, + :url => post["url"], + :slug => post["url-with-slug"], + } + end + + # Create a Hash of old urls => new urls, for rewriting and + # redirects, and replace urls in each post. Instantiate Jekyll + # site/posts to get the correct permalink format. + def self.rewrite_urls_and_redirects(posts) + site = Jekyll::Site.new(Jekyll.configuration({})) + dir = File.join(File.dirname(__FILE__), "..") + urls = Hash[posts.map { |post| + # Create an initial empty file for the post so that + # we can instantiate a post object. + File.open("_posts/tumblr/#{post[:name]}", "w") + tumblr_url = URI.parse(post[:slug]).path + jekyll_url = Jekyll::Post.new(site, dir, "", "tumblr/" + post[:name]).url + redirect_dir = tumblr_url.sub(/\//, "") + "/" + FileUtils.mkdir_p redirect_dir + File.open(redirect_dir + "index.html", "w") do |f| + f.puts "" + end + [tumblr_url, jekyll_url] + }] + posts.map { |post| + urls.each do |tumblr_url, jekyll_url| + post[:content].gsub!(/#{tumblr_url}/i, jekyll_url) + end + post + } + end + + # Uses Python's html2text to convert a post's content to + # markdown. Preserve HTML tables as per the markdown docs. + def self.html_to_markdown(content) + preserve = ["table", "tr", "th", "td"] + preserve.each do |tag| + content.gsub!(/<#{tag}/i, "$$" + tag) + content.gsub!(/<\/#{tag}/i, "||" + tag) + end + content = %x[echo '#{content.gsub("'", "''")}' | html2text] + preserve.each do |tag| + content.gsub!("$$" + tag, "<" + tag) + content.gsub!("||" + tag, " +require 'fileutils' +require 'rubygems' +require 'sequel' +require 'yaml' + +module Jekyll + module Typo + # This SQL *should* work for both MySQL and PostgreSQL, but I haven't + # tested PostgreSQL yet (as of 2008-12-16). + SQL = <<-EOS + SELECT c.id id, + c.title title, + c.permalink slug, + c.body body, + c.published_at date, + c.state state, + COALESCE(tf.name, 'html') filter + FROM contents c + LEFT OUTER JOIN text_filters tf + ON c.text_filter_id = tf.id + EOS + + def self.process dbname, user, pass, host='localhost' + FileUtils.mkdir_p '_posts' + db = Sequel.mysql(dbname, :user => user, :password => pass, :host => host, :encoding => 'utf8') + db[SQL].each do |post| + next unless post[:state] =~ /published/ + + name = [ sprintf("%.04d", post[:date].year), + sprintf("%.02d", post[:date].month), + sprintf("%.02d", post[:date].day), + post[:slug].strip ].join('-') + + # Can have more than one text filter in this field, but we just want + # the first one for this. + name += '.' + post[:filter].split(' ')[0] + + File.open("_posts/#{name}", 'w') do |f| + f.puts({ 'layout' => 'post', + 'title' => post[:title].to_s, + 'typo_id' => post[:id] + }.delete_if { |k, v| v.nil? || v == '' }.to_yaml) + f.puts '---' + f.puts post[:body].delete("\r") + end + end + end + + end +end diff --git a/lib/jekyll/migrators/wordpress.rb b/lib/jekyll/migrators/wordpress.rb new file mode 100644 index 00000000..d2d19039 --- /dev/null +++ b/lib/jekyll/migrators/wordpress.rb @@ -0,0 +1,294 @@ +require 'rubygems' +require 'sequel' +require 'fileutils' +require 'yaml' + +# NOTE: This converter requires Sequel and the MySQL gems. +# The MySQL gem can be difficult to install on OS X. Once you have MySQL +# installed, running the following commands should work: +# $ sudo gem install sequel +# $ sudo gem install mysql -- --with-mysql-config=/usr/local/mysql/bin/mysql_config + +module Jekyll + module WordPress + + # Main migrator function. Call this to perform the migration. + # + # dbname:: The name of the database + # user:: The database user name + # pass:: The database user's password + # host:: The address of the MySQL database host. Default: 'localhost' + # options:: A hash table of configuration options. + # + # Supported options are: + # + # :table_prefix:: Prefix of database tables used by WordPress. + # Default: 'wp_' + # :clean_entities:: If true, convert non-ASCII characters to HTML + # entities in the posts, comments, titles, and + # names. Requires the 'htmlentities' gem to + # work. Default: true. + # :comments:: If true, migrate post comments too. Comments + # are saved in the post's YAML front matter. + # Default: true. + # :categories:: If true, save the post's categories in its + # YAML front matter. + # :tags:: If true, save the post's tags in its + # YAML front matter. + # :more_excerpt:: If true, when a post has no excerpt but + # does have a tag, use the + # preceding post content as the excerpt. + # Default: true. + # :more_anchor:: If true, convert a tag into + # two HTML anchors with ids "more" and + # "more-NNN" (where NNN is the post number). + # Default: true. + # :status:: Array of allowed post statuses. Only + # posts with matching status will be migrated. + # Known statuses are :publish, :draft, :private, + # and :revision. If this is nil or an empty + # array, all posts are migrated regardless of + # status. Default: [:publish]. + # + def self.process(dbname, user, pass, host='localhost', options={}) + options = { + :table_prefix => 'wp_', + :clean_entities => true, + :comments => true, + :categories => true, + :tags => true, + :more_excerpt => true, + :more_anchor => true, + :status => [:publish] # :draft, :private, :revision + }.merge(options) + + if options[:clean_entities] + begin + require 'htmlentities' + rescue LoadError + STDERR.puts "Could not require 'htmlentities', so the " + + ":clean_entities option is now disabled." + options[:clean_entities] = false + end + end + + FileUtils.mkdir_p("_posts") + + db = Sequel.mysql(dbname, :user => user, :password => pass, + :host => host, :encoding => 'utf8') + + px = options[:table_prefix] + + posts_query = " + SELECT + posts.ID AS `id`, + posts.guid AS `guid`, + posts.post_type AS `type`, + posts.post_status AS `status`, + posts.post_title AS `title`, + posts.post_name AS `slug`, + posts.post_date AS `date`, + posts.post_content AS `content`, + posts.post_excerpt AS `excerpt`, + posts.comment_count AS `comment_count`, + users.display_name AS `author`, + users.user_login AS `author_login`, + users.user_email AS `author_email`, + users.user_url AS `author_url` + FROM #{px}posts AS `posts` + LEFT JOIN #{px}users AS `users` + ON posts.post_author = users.ID" + + if options[:status] and not options[:status].empty? + status = options[:status][0] + posts_query << " + WHERE posts.post_status = '#{status.to_s}'" + options[:status][1..-1].each do |status| + posts_query << " OR + posts.post_status = '#{status.to_s}'" + end + end + + db[posts_query].each do |post| + process_post(post, db, options) + end + end + + + def self.process_post(post, db, options) + px = options[:table_prefix] + + title = post[:title] + if options[:clean_entities] + title = clean_entities(title) + end + + slug = post[:slug] + if !slug or slug.empty? + slug = sluggify(title) + end + + date = post[:date] || Time.now + name = "%02d-%02d-%02d-%s.markdown" % [date.year, date.month, + date.day, slug] + content = post[:content].to_s + if options[:clean_entities] + content = clean_entities(content) + end + + excerpt = post[:excerpt].to_s + + more_index = content.index(//) + more_anchor = nil + if more_index + if options[:more_excerpt] and + (post[:excerpt].nil? or post[:excerpt].empty?) + excerpt = content[0...more_index] + end + if options[:more_anchor] + more_link = "more" + content.sub!(//, + "" + + "") + end + end + + categories = [] + tags = [] + + if options[:categories] or options[:tags] + + cquery = + "SELECT + terms.name AS `name`, + ttax.taxonomy AS `type` + FROM + #{px}terms AS `terms`, + #{px}term_relationships AS `trels`, + #{px}term_taxonomy AS `ttax` + WHERE + trels.object_id = '#{post[:id]}' AND + trels.term_taxonomy_id = ttax.term_taxonomy_id AND + terms.term_id = ttax.term_id" + + db[cquery].each do |term| + if options[:categories] and term[:type] == "category" + if options[:clean_entities] + categories << clean_entities(term[:name]) + else + categories << term[:name] + end + elsif options[:tags] and term[:type] == "post_tag" + if options[:clean_entities] + tags << clean_entities(term[:name]) + else + tags << term[:name] + end + end + end + end + + comments = [] + + if options[:comments] and post[:comment_count].to_i > 0 + cquery = + "SELECT + comment_ID AS `id`, + comment_author AS `author`, + comment_author_email AS `author_email`, + comment_author_url AS `author_url`, + comment_date AS `date`, + comment_date_gmt AS `date_gmt`, + comment_content AS `content` + FROM #{px}comments + WHERE + comment_post_ID = '#{post[:id]}' AND + comment_approved != 'spam'" + + + db[cquery].each do |comment| + + comcontent = comment[:content].to_s + if comcontent.respond_to?(:force_encoding) + comcontent.force_encoding("UTF-8") + end + if options[:clean_entities] + comcontent = clean_entities(comcontent) + end + comauthor = comment[:author].to_s + if options[:clean_entities] + comauthor = clean_entities(comauthor) + end + + comments << { + 'id' => comment[:id].to_i, + 'author' => comauthor, + 'author_email' => comment[:author_email].to_s, + 'author_url' => comment[:author_url].to_s, + 'date' => comment[:date].to_s, + 'date_gmt' => comment[:date_gmt].to_s, + 'content' => comcontent, + } + end + + comments.sort!{ |a,b| a['id'] <=> b['id'] } + end + + # Get the relevant fields as a hash, delete empty fields and + # convert to YAML for the header. + data = { + 'layout' => post[:type].to_s, + 'status' => post[:status].to_s, + 'published' => (post[:status].to_s == "publish"), + 'title' => title.to_s, + 'author' => post[:author].to_s, + 'author_login' => post[:author_login].to_s, + 'author_email' => post[:author_email].to_s, + 'author_url' => post[:author_url].to_s, + 'excerpt' => excerpt, + 'more_anchor' => more_anchor, + 'wordpress_id' => post[:id], + 'wordpress_url' => post[:guid].to_s, + 'date' => date, + 'categories' => options[:categories] ? categories : nil, + 'tags' => options[:tags] ? tags : nil, + 'comments' => options[:comments] ? comments : nil, + }.delete_if { |k,v| v.nil? || v == '' }.to_yaml + + # Write out the data and content to file + File.open("_posts/#{name}", "w") do |f| + f.puts data + f.puts "---" + f.puts content + end + end + + + def self.clean_entities( text ) + if text.respond_to?(:force_encoding) + text.force_encoding("UTF-8") + end + text = HTMLEntities.new.encode(text, :named) + # We don't want to convert these, it would break all + # HTML tags in the post and comments. + text.gsub!("&", "&") + text.gsub!("<", "<") + text.gsub!(">", ">") + text.gsub!(""", '"') + text.gsub!("'", "'") + text + end + + + def self.sluggify( title ) + begin + require 'unidecode' + title = title.to_ascii + rescue LoadError + STDERR.puts "Could not require 'unidecode'. If your post titles have non-ASCII characters, you could get nicer permalinks by installing unidecode." + end + title.downcase.gsub(/[^0-9A-Za-z]+/, " ").strip.gsub(" ", "-") + end + + end +end diff --git a/lib/jekyll/migrators/wordpressdotcom.rb b/lib/jekyll/migrators/wordpressdotcom.rb new file mode 100644 index 00000000..701c2af4 --- /dev/null +++ b/lib/jekyll/migrators/wordpressdotcom.rb @@ -0,0 +1,70 @@ +# coding: utf-8 + +require 'rubygems' +require 'hpricot' +require 'fileutils' +require 'yaml' +require 'time' + +module Jekyll + # This importer takes a wordpress.xml file, which can be exported from your + # wordpress.com blog (/wp-admin/export.php). + module WordpressDotCom + def self.process(filename = "wordpress.xml") + import_count = Hash.new(0) + doc = Hpricot::XML(File.read(filename)) + + (doc/:channel/:item).each do |item| + title = item.at(:title).inner_text.strip + permalink_title = item.at('wp:post_name').inner_text + # Fallback to "prettified" title if post_name is empty (can happen) + if permalink_title == "" + permalink_title = title.downcase.split.join('-') + end + + date = Time.parse(item.at('wp:post_date').inner_text) + status = item.at('wp:status').inner_text + + if status == "publish" + published = true + else + published = false + end + + type = item.at('wp:post_type').inner_text + tags = (item/:category).map{|c| c.inner_text}.reject{|c| c == 'Uncategorized'}.uniq + + metas = Hash.new + item.search("wp:postmeta").each do |meta| + key = meta.at('wp:meta_key').inner_text + value = meta.at('wp:meta_value').inner_text + metas[key] = value; + end + + name = "#{date.strftime('%Y-%m-%d')}-#{permalink_title}.html" + header = { + 'layout' => type, + 'title' => title, + 'tags' => tags, + 'status' => status, + 'type' => type, + 'published' => published, + 'meta' => metas + } + + FileUtils.mkdir_p "_#{type}s" + File.open("_#{type}s/#{name}", "w") do |f| + f.puts header.to_yaml + f.puts '---' + f.puts item.at('content:encoded').inner_text + end + + import_count[type] += 1 + end + + import_count.each do |key, value| + puts "Imported #{value} #{key}s" + end + end + end +end