django sitemap分页及适配百度、神马、搜狗搜索引擎的方法

django sitemap-article_2.xml 分页方法

本文主要讲一下sitemap分页时如何自定义url格式

Django Sitemap 中的URL样式为:

https://www.soogor.com/sitemap-article.xml
https://www.soogor.com/sitemap-article.xml?p=2

像“https://www.soogor.com/sitemap-article.xml?p=2”这样的格式在百度、神马等搜索引擎是提交不了的,因为它们要求URL必须以.xml结尾。

所以我们就要改变一下URL,把分页地址改成以.xml结尾的形式,例如:https://www.soogor.com/sitemap-article_2.xml

在之前做sitemap时,我们引入了from django.contrib.sitemaps import views as sitemaps_views ,所以我们就查看一下这里的views,位于

venv>Lib>site-packages>django>contrib>sitemaps>views.py

下面我们将这里进行修改。

我们在项目配置文件夹下新建sgsitemap.py文件(也就是settings.py所在的文件夹)

sgsitemap.py

import datetime
from functools import wraps

from django.contrib.sites.shortcuts import get_current_site
from django.core.paginator import EmptyPage, PageNotAnInteger
from django.http import Http404
from django.template.response import TemplateResponse
from django.urls import reverse
from django.utils import timezone
from django.utils.http import http_date


def x_robots_tag(func):
    """Decorate a view so that its response carries an ``X-Robots-Tag`` header.

    The wrapped view is called with the same arguments it would normally
    receive; the header is set on whatever response object it returns, and
    that same object is handed back to the caller.
    """

    @wraps(func)
    def wrapper(request, *view_args, **view_kwargs):
        resp = func(request, *view_args, **view_kwargs)
        resp.headers["X-Robots-Tag"] = "noindex, noodp, noarchive"
        return resp

    return wrapper


@x_robots_tag
def index(
    request,
    sitemaps,
    template_name="sitemap_index.xml",
    content_type="application/xml",
    sitemap_url_name="django.contrib.sitemaps.views.sitemap",
):
    """Render the sitemap index, listing one URL per page of every section.

    The first page of a section is linked with the plain section name; every
    further page is linked as ``<section>_<page>`` so that the page number is
    part of the URL path instead of a ``?p=`` query string.

    Args:
        request: The current HTTP request.
        sitemaps: Mapping of section name to a sitemap instance or a callable
            returning one.
        template_name: Template used to render the index.
        content_type: Content type of the response.
        sitemap_url_name: URL pattern name that is reversed with a
            ``section`` keyword argument to build each link.

    Returns:
        A ``TemplateResponse`` whose context contains ``sitemaps``, the list
        of absolute sitemap URLs.
    """
    req_protocol = request.scheme
    req_site = get_current_site(request)

    sites = []  # all sections' sitemap URLs
    for section, site in sitemaps.items():
        # For each section label, add links of all pages of its sitemap
        # (usually generated by the `sitemap` view).
        if callable(site):
            site = site()
        protocol = req_protocol if site.protocol is None else site.protocol

        # Page 1 keeps the bare section name; pages 2..N get "<section>_<N>".
        # The section name itself is never modified, so names that already
        # contain an underscore (e.g. "blog_posts") are kept intact.
        url_sections = [section]
        url_sections.extend(
            "{}_{}".format(section, page)
            for page in range(2, site.paginator.num_pages + 1)
        )
        for url_section in url_sections:
            sitemap_url = reverse(sitemap_url_name, kwargs={"section": url_section})
            absolute_url = "%s://%s%s" % (protocol, req_site.domain, sitemap_url)
            sites.append(absolute_url)

    return TemplateResponse(
        request, template_name, {"sitemaps": sites}, content_type=content_type
    )


@x_robots_tag
def sitemap(
    request,
    sitemaps,
    section=None,
    template_name="sitemap.xml",
    content_type="application/xml",
):
    """Render one page of a sitemap, taking the page number from the URL path.

    ``section`` may be a plain section name (the page then comes from the
    ``p`` query parameter, defaulting to 1) or ``<section>_<page>``, in which
    case the trailing ``_<page>`` part selects the page.

    Args:
        request: The current HTTP request.
        sitemaps: Mapping of section name to a sitemap instance or a callable
            returning one.
        section: Section name, optionally suffixed with ``_<page>``. ``None``
            renders the URLs of all sections.
        template_name: Template used to render the sitemap.
        content_type: Content type of the response.

    Returns:
        A ``TemplateResponse`` whose context contains ``urlset``. A
        ``Last-Modified`` header is set when every sitemap reports a
        ``latest_lastmod``.

    Raises:
        Http404: If the section is unknown, or the page is empty or not an
            integer.
    """
    req_protocol = request.scheme
    req_site = get_current_site(request)
    page = request.GET.get("p", 1)

    # Split a custom "<section>_<page>" value into its two parts.
    # An exact match against a registered section always wins, and the split
    # is made at the LAST underscore, so section names that contain
    # underscores themselves still resolve correctly. `section` may be None
    # (its default), in which case there is nothing to split.
    if section is not None and section not in sitemaps:
        base_section, separator, page_suffix = section.rpartition("_")
        if separator and base_section in sitemaps:
            # The paginator validates page_suffix; a bad value becomes a 404
            # through the PageNotAnInteger / EmptyPage handlers below.
            section, page = base_section, page_suffix

    if section is not None:
        if section not in sitemaps:
            raise Http404("No sitemap available for section: %r" % section)
        maps = [sitemaps[section]]
    else:
        maps = sitemaps.values()

    lastmod = None
    all_sites_lastmod = True
    urls = []
    for site in maps:
        try:
            if callable(site):
                site = site()
            urls.extend(site.get_urls(page=page, site=req_site, protocol=req_protocol))
            if all_sites_lastmod:
                site_lastmod = getattr(site, "latest_lastmod", None)
                if site_lastmod is not None:
                    # Normalise plain dates to midnight datetimes so they can
                    # be compared with each other.
                    if not isinstance(site_lastmod, datetime.datetime):
                        site_lastmod = datetime.datetime.combine(
                            site_lastmod, datetime.time.min
                        )
                    if timezone.is_naive(site_lastmod):
                        # datetime.timezone.utc replaces the alias
                        # django.utils.timezone.utc, which newer Django
                        # releases no longer provide.
                        site_lastmod = timezone.make_aware(
                            site_lastmod, datetime.timezone.utc
                        )
                    lastmod = (
                        site_lastmod if lastmod is None else max(lastmod, site_lastmod)
                    )
                else:
                    all_sites_lastmod = False
        except EmptyPage:
            raise Http404("Page %s empty" % page)
        except PageNotAnInteger:
            raise Http404("No page '%s'" % page)
    response = TemplateResponse(
        request, template_name, {"urlset": urls}, content_type=content_type
    )
    if all_sites_lastmod and lastmod is not None:
        # if lastmod is defined for all sites, set header so as
        # ConditionalGetMiddleware is able to send 304 NOT MODIFIED
        response.headers["Last-Modified"] = http_date(lastmod.timestamp())
    return response

然后我们再修改urls.py

urls.py

from . import sgsitemap  # the custom sitemap views written above

urlpatterns += [
    # Sitemap index: served by sgsitemap.index, which is what changes the
    # URL style of the paginated links. "sitemap_url_name" is the name of
    # the route declared below.
    path(
        "sitemap.xml",
        cache_page(86400)(sgsitemap.index),
        kwargs={
            "sitemaps": sitemaps,
            "sitemap_url_name": "sitemaps",
        },
    ),
    # One sitemap page: <section> is passed to sgsitemap.sitemap as the
    # `section` argument.
    path(
        "sitemap-<section>.xml",
        cache_page(86400)(sgsitemap.sitemap),
        kwargs={"sitemaps": sitemaps},
        name="sitemaps",
    ),
]

 

THE END