module Jekyll
  # Migrates posts (and optionally their categories, tags and comments) from
  # a WordPress MySQL database into Jekyll post files under "_posts/".
  module WordPress

    # Option values used by ::process when the caller does not override them.
    DEFAULT_OPTIONS = {
      :table_prefix   => 'wp_',
      :clean_entities => true,
      :comments       => true,
      :categories     => true,
      :tags           => true,
      :more_excerpt   => true,
      :more_anchor    => true,
      :status         => [:publish].freeze # :draft, :private, :revision
    }.freeze

    # Matches the WordPress "more" marker inside post content.
    MORE_TAG = /<!-- *more *-->/

    # Entities that ::clean_entities turns back into literal characters so
    # that HTML markup in posts and comments keeps working.
    BASIC_ENTITIES = {
      "&amp;"  => "&",
      "&lt;"   => "<",
      "&gt;"   => ">",
      "&quot;" => '"',
      "&apos;" => "'"
    }.freeze

    # Main migrator function. Call this to perform the migration.
    #
    # dbname::  The name of the database
    # user::    The database user name
    # pass::    The database user's password
    # host::    The address of the MySQL database host. Default: 'localhost'
    # options:: A hash table of configuration options. For backward
    #           compatibility a String is also accepted and is treated as
    #           the :table_prefix option.
    #
    # Supported options are:
    #
    # :table_prefix::   Prefix of database tables used by WordPress.
    #                   Default: 'wp_'
    # :clean_entities:: If true, convert non-ASCII characters to HTML
    #                   entities in the posts, comments, titles, and
    #                   names. Requires the 'htmlentities' gem to
    #                   work. Default: true.
    # :comments::       If true, migrate post comments too. Comments
    #                   are saved in the post's YAML front matter.
    #                   Default: true.
    # :categories::     If true, save the post's categories in its
    #                   YAML front matter. Default: true.
    # :tags::           If true, save the post's tags in its
    #                   YAML front matter. Default: true.
    # :more_excerpt::   If true, when a post has no excerpt but
    #                   does have a <!--more--> tag, use the
    #                   preceding post content as the excerpt.
    #                   Default: true.
    # :more_anchor::    If true, convert a <!--more--> tag into
    #                   two HTML anchors with ids "more" and
    #                   "more-NNN" (where NNN is the post number).
    #                   Default: true.
    # :status::         Array of allowed post statuses. Only
    #                   posts with matching status will be migrated.
    #                   Known statuses are :publish, :draft, :private,
    #                   and :revision. If this is nil or an empty
    #                   array, all posts are migrated regardless of
    #                   status. Default: [:publish].
    #
    def self.process(dbname, user, pass, host='localhost', options={})
      # The previous signature took the table prefix as the fifth argument.
      options = { :table_prefix => options.to_str } if options.respond_to?(:to_str)
      options = DEFAULT_OPTIONS.merge(options)

      if options[:clean_entities]
        begin
          require 'htmlentities'
        rescue LoadError
          STDERR.puts "Could not require 'htmlentities', so the " +
                      ":clean_entities option is now disabled."
          options[:clean_entities] = false
        end
      end

      FileUtils.mkdir_p("_posts")

      db = Sequel.mysql(dbname, :user => user, :password => pass,
                        :host => host, :encoding => 'utf8')

      px = options[:table_prefix]

      # Array() also tolerates nil and a single bare status.
      statuses = Array(options[:status]).map { |status| status.to_s }
      where_clause = ""
      unless statuses.empty?
        # One placeholder per status; the values are bound by Sequel below.
        where_clause = "WHERE " +
                       (["posts.post_status = ?"] * statuses.size).join(" OR ")
      end

      posts_query = <<-SQL
        SELECT
          posts.ID            AS `id`,
          posts.guid          AS `guid`,
          posts.post_type     AS `type`,
          posts.post_status   AS `status`,
          posts.post_title    AS `title`,
          posts.post_name     AS `slug`,
          posts.post_date     AS `date`,
          posts.post_content  AS `content`,
          posts.post_excerpt  AS `excerpt`,
          posts.comment_count AS `comment_count`,
          users.display_name  AS `author`,
          users.user_login    AS `author_login`,
          users.user_email    AS `author_email`,
          users.user_url      AS `author_url`
        FROM #{px}posts AS `posts`
          LEFT JOIN #{px}users AS `users`
            ON posts.post_author = users.ID
        #{where_clause}
      SQL

      db[posts_query, *statuses].each do |post|
        process_post(post, db, options)
      end
    end


    # Writes one post row to "_posts/YYYY-MM-DD-slug.markdown" as YAML
    # front matter followed by the post content.
    #
    # post::    A row hash with the column aliases selected in ::process
    #           (:id, :title, :slug, :date, :content, :excerpt, ...).
    # db::      The database handle; it is called as db[sql, *values] to
    #           look up the post's terms and comments.
    # options:: The merged options hash (see ::process).
    #
    # The "_posts" directory must already exist.
    def self.process_post(post, db, options)
      title = post[:title].to_s
      title = clean_entities(title) if options[:clean_entities]

      slug = post[:slug].to_s
      # Slug from the raw title: the entity-encoded title would turn
      # "&eacute;" into the words "eacute" and defeat transliteration.
      slug = sluggify(post[:title]) if slug.empty?
      # Last resort, so posts without a usable title do not share a file.
      slug = post[:id].to_s if slug.empty?

      date = post[:date] || Time.now
      name = "%02d-%02d-%02d-%s.markdown" % [date.year, date.month,
                                             date.day, slug]

      # Work on a copy: the content is edited in place below.
      content = post[:content].to_s.dup
      content = clean_entities(content) if options[:clean_entities]

      excerpt = post[:excerpt].to_s

      more_anchor = nil
      more_index = content.index(MORE_TAG)
      if more_index
        if options[:more_excerpt] && excerpt.empty?
          excerpt = content[0...more_index]
        end
        if options[:more_anchor]
          more_anchor = "more"
          content.sub!(MORE_TAG,
                       "<a id=\"more\"></a>" +
                       "<a id=\"more-#{post[:id]}\"></a>")
        end
      end

      categories, tags = fetch_terms(post, db, options)
      comments = fetch_comments(post, db, options)

      # Get the relevant fields as a hash, delete empty fields and
      # convert to YAML for the header.
      data = {
        'layout'        => post[:type].to_s,
        'status'        => post[:status].to_s,
        'published'     => (post[:status].to_s == "publish"),
        'title'         => title,
        'author'        => post[:author].to_s,
        'author_login'  => post[:author_login].to_s,
        'author_email'  => post[:author_email].to_s,
        'author_url'    => post[:author_url].to_s,
        'excerpt'       => excerpt,
        'more_anchor'   => more_anchor,
        'wordpress_id'  => post[:id],
        'wordpress_url' => post[:guid].to_s,
        'date'          => date,
        'categories'    => options[:categories] ? categories : nil,
        'tags'          => options[:tags] ? tags : nil,
        'comments'      => options[:comments] ? comments : nil,
      }.delete_if { |_key, value| value.nil? || value == '' }.to_yaml

      # Write out the data and content to file
      File.open("_posts/#{name}", "w") do |f|
        f.puts data
        f.puts "---"
        f.puts content
      end
    end


    # Converts non-ASCII characters in +text+ to named HTML entities while
    # leaving &, <, >, " and ' as literal characters. Returns a new string;
    # nil is treated as an empty string. Requires HTMLEntities to be loaded.
    def self.clean_entities( text )
      text = text.to_s.dup
      text.force_encoding("UTF-8") if text.respond_to?(:force_encoding)
      text = HTMLEntities.new.encode(text, :named)
      # We don't want to convert these, it would break all
      # HTML tags in the post and comments. A single pass is used so that
      # an entity the author wrote (e.g. "&lt;", encoded to "&amp;lt;")
      # comes back as "&lt;" rather than being decoded twice into "<".
      text.gsub(/&(?:amp|lt|gt|quot|apos);/) { |entity| BASIC_ENTITIES[entity] }
    end


    # Builds a lowercase, dash-separated ASCII slug from +title+.
    # Uses the 'unidecode' gem to transliterate when it is available.
    # nil is treated as an empty title.
    def self.sluggify( title )
      title = title.to_s
      begin
        require 'unidecode'
        title = title.to_ascii
      rescue LoadError
        # Warn only once per run rather than once per post.
        unless @unidecode_warning_shown
          STDERR.puts "Could not require 'unidecode'. If your post titles have non-ASCII characters, you could get nicer permalinks by installing unidecode."
          @unidecode_warning_shown = true
        end
      end
      title.downcase.gsub(/[^0-9A-Za-z]+/, " ").strip.gsub(" ", "-")
    end


    # Looks up the terms attached to +post+ and returns them as a
    # [categories, tags] pair of name arrays. Skips the query entirely
    # when neither :categories nor :tags is enabled.
    def self.fetch_terms(post, db, options)
      categories = []
      tags = []
      return [categories, tags] unless options[:categories] || options[:tags]

      px = options[:table_prefix]
      terms_query = <<-SQL
        SELECT
          terms.name    AS `name`,
          ttax.taxonomy AS `type`
        FROM
          #{px}terms              AS `terms`,
          #{px}term_relationships AS `trels`,
          #{px}term_taxonomy      AS `ttax`
        WHERE
          trels.object_id = ? AND
          trels.term_taxonomy_id = ttax.term_taxonomy_id AND
          terms.term_id = ttax.term_id
      SQL

      db[terms_query, post[:id]].each do |term|
        if options[:categories] && term[:type] == "category"
          bucket = categories
        elsif options[:tags] && term[:type] == "post_tag"
          bucket = tags
        else
          next
        end
        term_name = term[:name]
        term_name = clean_entities(term_name) if options[:clean_entities]
        bucket << term_name
      end

      [categories, tags]
    end


    # Returns the non-spam comments of +post+ as an array of hashes sorted
    # by comment id. Returns an empty array when :comments is disabled or
    # the post row reports no comments.
    def self.fetch_comments(post, db, options)
      return [] unless options[:comments] && post[:comment_count].to_i > 0

      px = options[:table_prefix]
      comments_query = <<-SQL
        SELECT
          comment_ID           AS `id`,
          comment_author       AS `author`,
          comment_author_email AS `author_email`,
          comment_author_url   AS `author_url`,
          comment_date         AS `date`,
          comment_date_gmt     AS `date_gmt`,
          comment_content      AS `content`
        FROM #{px}comments
        WHERE
          comment_post_ID = ? AND
          comment_approved != 'spam'
      SQL

      comments = db[comments_query, post[:id]].map do |comment|
        body = comment[:content].to_s.dup
        body.force_encoding("UTF-8") if body.respond_to?(:force_encoding)
        author = comment[:author].to_s
        if options[:clean_entities]
          body = clean_entities(body)
          author = clean_entities(author)
        end

        {
          'id'           => comment[:id].to_i,
          'author'       => author,
          'author_email' => comment[:author_email].to_s,
          'author_url'   => comment[:author_url].to_s,
          'date'         => comment[:date].to_s,
          'date_gmt'     => comment[:date_gmt].to_s,
          'content'      => body,
        }
      end

      comments.sort_by { |comment| comment['id'] }
    end

    private_class_method :fetch_terms, :fetch_comments

  end
end