# Sitemap.xml Generator is a Jekyll plugin that generates a sitemap.xml file by
# traversing all of the available posts and pages.
# 
# See readme file for documentation
# 
# Updated to use config file for settings by Daniel Groves
# Site: http://danielgroves.net
# 
# Author: Michael Levin
# Site: http://www.kinnetica.com
# Distributed Under A Creative Commons License
#   - http://creativecommons.org/licenses/by/3.0/
require 'fileutils'
require 'jekyll/document'
require 'rexml/document'

module Jekyll

  # Reopens Jekyll::Document to add the helpers the sitemap generator calls
  # on each post.
  class Jekyll::Document
    attr_accessor :name

    # Source path used when matching against the exclude / include_posts
    # lists. Built from @name alone; yields "" when @name is nil because
    # compact drops the nil before File.join runs.
    def path_to_source
      File.join(*[@name].compact)
    end

    # Absolute URL of this document.
    #
    # my_url - site prefix that is prepended to #url
    #
    # Returns the concatenated String
    def location_on_server(my_url)
      "#{my_url}#{url}"
    end
  end

  # Reopens Page to add the helpers the sitemap generator calls on each page.
  class Page
    attr_accessor :name

    # Source path (directory + file name) used when matching against the
    # exclude / include_posts lists; nil parts are skipped.
    def path_to_source
      File.join(*[@dir, @name].compact)
    end

    # Absolute URL of this page, with a trailing "index.html" file name
    # removed so directory indexes are listed by their folder URL.
    #
    # my_url - site prefix that is prepended to #url
    #
    # Returns the location String
    def location_on_server(my_url)
      location = "#{my_url}#{url}"
      # Strip only a whole "index.html" path segment at the very end: the dot
      # is escaped, \z anchors to the end of the string, and the leading
      # (\A|/) keeps names such as "myindex.html" from being truncated.
      location.sub(%r{(\A|/)index\.html\z}, '\1')
    end
  end

  # Recover from strange exception when starting server without --auto
  #
  # StaticFile subclass whose write is a no-op that reports success. The
  # generator writes the sitemap itself and then registers an instance of
  # this class in site.static_files.
  class SitemapFile < StaticFile
    # dest - destination passed by the caller (ignored)
    #
    # Returns true without writing anything
    def write(dest)
      true
    end
  end

  class SitemapGenerator < Generator
    priority :lowest

    # Config defaults, used when the matching key is absent from the
    # 'sitemap' section of the site configuration. Frozen because generate
    # hands these very objects out through @config, so an accidental in-place
    # modification would otherwise change the defaults themselves.
    SITEMAP_FILE_NAME = "/sitemap.xml".freeze
    EXCLUDE = ["/atom.xml", "/feed.xml", "/feed/index.xml"].freeze
    INCLUDE_POSTS = ["/index.html"].freeze
    CHANGE_FREQUENCY_NAME = "change_frequency".freeze
    PRIORITY_NAME = "priority".freeze

    # Valid values allowed by sitemap.xml spec for change frequencies
    VALID_CHANGE_FREQUENCY_VALUES = ["always", "hourly", "daily", "weekly",
      "monthly", "yearly", "never"].freeze

    # Goes through pages and posts and generates the sitemap file.
    #
    # site - the site being built; its optional config['sitemap'] hash
    #        overrides the defaults, and the file is written below site.dest
    #
    # Side effects: sets @config and @last_modified_post_date, writes the
    # sitemap to disk and appends a SitemapFile entry to site.static_files.
    #
    # Returns site.static_files
    def generate(site)
      # Configuration: every key falls back to its class-level default
      sitemap_config = site.config['sitemap'] || {}
      @config = {
        'filename' => sitemap_config['filename'] || SITEMAP_FILE_NAME,
        'change_frequency_name' => sitemap_config['change_frequency_name'] || CHANGE_FREQUENCY_NAME,
        'priority_name' => sitemap_config['priority_name'] || PRIORITY_NAME,
        'exclude' => sitemap_config['exclude'] || EXCLUDE,
        'include_posts' => sitemap_config['include_posts'] || INCLUDE_POSTS
      }

      # Built in two steps so the code does not depend on what
      # REXML::Document#<< happens to return.
      sitemap = REXML::Document.new
      sitemap << REXML::XMLDecl.new("1.0", "UTF-8")

      urlset = REXML::Element.new "urlset"
      urlset.add_attribute("xmlns",
        "http://www.sitemaps.org/schemas/sitemap/0.9")

      # fill_last_modified reads a nil @last_modified_post_date as "this is a
      # post", so clear any value left over from an earlier run on the same
      # generator instance before the posts are processed.
      @last_modified_post_date = nil
      @last_modified_post_date = fill_posts(site, urlset)
      fill_pages(site, urlset)

      sitemap.add_element(urlset)

      # Create the destination directory (and any missing parents, including
      # a sub-directory named in the configured filename). Otherwise, we
      # cannot write our file there.
      filename = @config['filename']
      target = File.join(site.dest, filename)
      FileUtils.mkdir_p(File.dirname(target))

      # File I/O: write out pretty-printed XML; the block form closes the
      # file even when the formatter raises.
      File.open(target, "w") do |file|
        formatter = REXML::Formatters::Pretty.new(4)
        formatter.compact = true
        formatter.write(sitemap, file)
      end

      # Keep the sitemap.xml file from being cleaned by Jekyll
      site.static_files << Jekyll::SitemapFile.new(site, site.dest, "/", filename)
    end

    # Create url elements for all the posts and find the date of the latest one
    #
    # site   - the site being built; posts come from site.collections["posts"]
    # urlset - element that receives one url child per non-excluded post
    #
    # Excluded posts get no url element but still count towards the date.
    #
    # Returns the newest file modification time among the posts, or nil when
    # there are no posts
    def fill_posts(site, urlset)
      newest = nil

      site.collections["posts"].docs.each do |post|
        urlset.add_element(fill_url(site, post)) unless excluded?(site, post.name)

        modified_at = File.mtime(post.path)
        newest = modified_at if newest.nil? || modified_at > newest
      end

      newest
    end

    # Create url elements for all the normal pages
    #
    # site   - the site being built; pages come from site.pages
    # urlset - element that receives one url child per listed page
    #
    # A page is skipped when its source path is in the exclude list or when
    # page.path does not point at an existing file.
    #
    # Returns the site.pages collection that was iterated
    def fill_pages(site, urlset)
      site.pages.each do |page|
        next if excluded?(site, page.path_to_source)
        # File.exist? instead of File.exists?: the plural alias was
        # deprecated for years and removed in Ruby 3.2.
        next unless File.exist?(page.path)

        urlset.add_element(fill_url(site, page))
      end
    end

Mike McDonald's avatar
Mike McDonald committed
137
    # Fill data of each URL element: location, last modified,
Riccardo Spagni's avatar
Riccardo Spagni committed
138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
    # change frequency (optional), and priority.
    #
    # Returns url REXML::Element
    def fill_url(site, page_or_post)
      url = REXML::Element.new "url"

      loc = fill_location(site, page_or_post)
      url.add_element(loc)

      lastmod = fill_last_modified(site, page_or_post)
      url.add_element(lastmod) if lastmod



      if (page_or_post.data[@config['change_frequency_name']])
        change_frequency = 
          page_or_post.data[@config['change_frequency_name']].downcase
          
        if (valid_change_frequency?(change_frequency))
          changefreq = REXML::Element.new "changefreq"
          changefreq.text = change_frequency
          url.add_element(changefreq)
        else
          puts "ERROR: Invalid Change Frequency In #{page_or_post.name}"
        end
      end

      if (page_or_post.data[@config['priority_name']])
        priority_value = page_or_post.data[@config['priority_name']]
        if valid_priority?(priority_value)
          priority = REXML::Element.new "priority"
          priority.text = page_or_post.data[@config['priority_name']]
          url.add_element(priority)
        else
          puts "ERROR: Invalid Priority In #{page_or_post.name}"
        end
      end

      url
    end

    # Get URL location of page or post
    #
    # site         - the site being built; config['url'] and config['baseurl']
    #                form the prefix handed to the entry
    # page_or_post - entry that responds to location_on_server
    #
    # Returns the loc REXML::Element holding the location of the page or post
    def fill_location(site, page_or_post)
      loc = REXML::Element.new "loc"
      # Interpolation turns an unset url or baseurl into "", where String#+
      # would raise on nil.
      site_prefix = "#{site.config['url']}#{site.config['baseurl']}"
      loc.text = page_or_post.location_on_server(site_prefix)

      loc
    end

    # Fill lastmod XML element with the last modified date for the page or post.
    #
    # site         - the site being built (used to look up layouts)
    # page_or_post - entry whose file at #path supplies the starting date
    #
    # The date is the newest of the entry's own file and the layouts it uses.
    # When a latest post date is known (@last_modified_post_date is set) and
    # the entry's source path is in the include_posts list, that post date is
    # taken into account as well. While @last_modified_post_date is nil the
    # entry is treated as a post and only its own date is used.
    #
    # Returns lastmod REXML::Element
    def fill_last_modified(site, page_or_post)
      lastmod = REXML::Element.new "lastmod"
      own_date = File.mtime(page_or_post.path)
      latest_date = find_latest_date(own_date, site, page_or_post)

      if @last_modified_post_date && posts_included?(site, page_or_post.path_to_source)
        # We want to take into account the last post date
        latest_date = greater_date(latest_date, @last_modified_post_date)
      end

      lastmod.text = latest_date.iso8601
      lastmod
    end

    # Go through the page/post and any implemented layouts and get the latest
    # modified date
    #
    # latest_date  - starting date to compare against
    # site         - the site being built; site.layouts maps names to layouts
    # page_or_post - entry whose data["layout"] names the first layout
    #
    # Each layout's own data["layout"] is followed in turn until a name is
    # missing or has no entry in site.layouts.
    #
    # Returns the newest of latest_date and the layout files' modification
    # times
    def find_latest_date(latest_date, site, page_or_post)
      layouts = site.layouts
      layout = layouts[page_or_post.data["layout"]]

      while layout
        modified_at = File.mtime(layout.path)
        latest_date = modified_at if modified_at > latest_date

        # Move on to the parent layout, if any
        layout = layouts[layout.data["layout"]]
      end

      latest_date
    end

    # Pick the later of two dates
    #
    # date1 - first date
    # date2 - second date
    #
    # Returns date1 when it is later than or equal to date2, otherwise date2
    def greater_date(date1, date2)
      date1 >= date2 ? date1 : date2
    end

    # Is the given source name on the configured exclude list?
    #
    # site - the site being built (not used)
    # name - source name to look up in @config['exclude']
    #
    # Returns boolean
    def excluded?(site, name)
      exclusions = @config['exclude']
      exclusions.include?(name)
    end

    # Is the given source name on the configured include_posts list?
    #
    # site - the site being built (not used)
    # name - source name to look up in @config['include_posts']
    #
    # Returns boolean
    def posts_included?(site, name)
      post_aware_sources = @config['include_posts']
      post_aware_sources.include?(name)
    end

    # Check a change frequency against the values the sitemap.xml spec allows
    #
    # change_frequency - value to look up in VALID_CHANGE_FREQUENCY_VALUES
    #
    # Returns boolean
    def valid_change_frequency?(change_frequency)
      VALID_CHANGE_FREQUENCY_VALUES.include?(change_frequency)
    end

    # Is the priority value provided valid according to the spec
    #
    # priority - value to check; accepted when Float() can convert it and the
    #            result lies between 0.0 and 1.0 inclusive
    #
    # Returns boolean
    def valid_priority?(priority)
      value = Float(priority)
      value >= 0.0 && value <= 1.0
    rescue ArgumentError, TypeError
      # ArgumentError: a string that is not a number.
      # TypeError: a value Float() cannot convert at all (nil, true, an array).
      false
    end
  end
end