2团
Published on 2026-03-09 / 0 Visits
0
0

基于Rust重构Bing IndexNow自动推送工具

1. 项目背景

IndexNow是由微软Bing发起的一项开源协议,旨在让网站所有者能够更快速地将新内容或更新内容通知给搜索引擎。通过IndexNow API推送URL,可以加速Bing搜索引擎的收录速度。

这个项目最初是为我的博客网站 2团日志 开发的,目的是快速推送新文章到搜索引擎,提升SEO效果。最初使用Java实现,最近发现云主机资源紧张,而这个功能又很简单,因此决定尝试用Rust重写,以尽可能降低内存占用。

2. 技术实现

2.1 项目依赖

项目使用 Rust 编写,主要依赖以下库:

[dependencies]
anyhow = "1.0"          # error handling
quick-xml = "0.31"      # XML parsing
reqwest = "0.11"        # HTTP client
serde = { version = "1.0", features = ["derive"] }  # serialization
serde_json = "1.0"      # JSON encoding of the IndexNow request body
serde_yaml = "0.9"      # YAML configuration parsing
tokio = { version = "1.0", features = ["full"] }   # async runtime
tracing = "0.1"         # logging
# `EnvFilter` is only available when the `env-filter` feature is enabled.
tracing-subscriber = { version = "0.3", features = ["env-filter"] }

2.2 核心数据结构

首先定义配置和API请求的数据结构:

use serde::{Deserialize, Serialize};

/// Root of the deserialized configuration document.
///
/// All settings live under the top-level `app` key.
#[derive(Debug, Deserialize)]
struct Config {
    /// Application-level settings.
    app: AppConfig,
}

/// Application-level settings read from the `app` section of the configuration.
#[derive(Debug, Deserialize)]
struct AppConfig {
    /// Sites to process; one entry per configured host.
    hosts: Vec<Host>,
}

/// Settings for a single site that is pushed to IndexNow.
///
/// Configuration keys are kebab-case, so `key_location` is read from
/// `key-location`; the single-word fields keep their names.
#[derive(Clone, Debug, Deserialize)]
#[serde(rename_all = "kebab-case")]
struct Host {
    /// Domain of the site; also used to build the sitemap URL.
    host: String,
    /// IndexNow key sent with every push request.
    key: String,
    /// Location of the key verification file, sent with every push request.
    key_location: String,
}

/// JSON request body posted to the IndexNow endpoint.
///
/// Fields are serialized in camelCase, i.e. `host`, `key`, `keyLocation`
/// and `urlList`.
#[derive(Debug, Serialize)]
#[serde(rename_all = "camelCase")]
struct IndexNowPayload {
    /// Domain the submitted URLs belong to.
    host: String,
    /// IndexNow key of the site.
    key: String,
    /// Location of the key verification file.
    key_location: String,
    /// URLs submitted in this request.
    url_list: Vec<String>,
}

Host 结构体存储每个站点的配置信息,包括域名、IndexNow 密钥和密钥验证文件位置。IndexNowPayload 是推送到 API 的请求体格式。

2.3 主程序入口

主程序使用 Tokio 异步运行时,初始化日志并启动服务:

use std::time::Duration;
use tokio::time::sleep;
use tracing::{error, info};

/// Pause between two consecutive push rounds: 12 hours.
const SCHEDULE_INTERVAL: Duration = Duration::from_secs(12 * 60 * 60);

/// Process entry point: installs the log subscriber, then runs the service.
///
/// The log filter is taken from the default environment variable and falls
/// back to `info` when that cannot be read. If `run` returns an error, it is
/// logged and the process exits with status 1.
#[tokio::main]
async fn main() {
    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
        .unwrap_or_else(|_| "info".into());
    tracing_subscriber::fmt().with_env_filter(env_filter).init();

    match run().await {
        Ok(()) => {}
        Err(error) => {
            error!("服务启动失败: {error:#}");
            std::process::exit(1);
        }
    }
}

/// Loads the configuration, builds the HTTP client and runs the push loop.
///
/// The loop pushes once immediately and then sleeps for `SCHEDULE_INTERVAL`
/// between rounds. It never terminates on its own, so this function only
/// returns when one of the start-up steps fails.
///
/// # Errors
///
/// Returns an error if the configuration path cannot be resolved, the
/// configuration cannot be loaded, or the HTTP client cannot be built.
async fn run() -> Result<()> {
    let config_path = resolve_config_path()?;
    let config = load_config(&config_path)?;

    let request_timeout = Duration::from_secs(30);
    let client = Client::builder()
        .user_agent("blog-indexnow/0.1")
        .timeout(request_timeout)
        .build()
        .context("初始化 HTTP 客户端失败")?;

    info!("配置加载成功: {}", config_path.display());

    // Fingerprint of the last successfully pushed URL list, keyed by host name.
    let mut host_fingerprints = HashMap::<String, u64>::new();
    loop {
        push_bing_index(&client, &config, &mut host_fingerprints).await;
        sleep(SCHEDULE_INTERVAL).await;
    }
}

程序每12小时执行一次推送任务,使用 HashMap 存储每个站点的URL指纹,用于检测内容变化。

2.4 获取并解析 Sitemap

获取 sitemap.xml 并解析其中的 URL:

use quick_xml::events::Event;
use quick_xml::Reader;

/// Downloads `https://<host>/sitemap.xml` and returns the response body as text.
///
/// # Errors
///
/// Returns an error when the request cannot be sent, when the server answers
/// with a non-success status, or when the body cannot be read.
async fn get_sitemap_xml(client: &Client, host: &Host) -> Result<String> {
    let sitemap_url = format!("https://{}/sitemap.xml", host.host);

    let request = client.get(&sitemap_url);
    let response = request
        .send()
        .await
        .with_context(|| format!("请求 sitemap 失败: {sitemap_url}"))?;

    let status = response.status();
    if status.is_success() {
        response
            .text()
            .await
            .with_context(|| format!("读取 sitemap 响应失败: {sitemap_url}"))
    } else {
        anyhow::bail!("请求 sitemap 非成功状态: {status}")
    }
}

fn extract_urls_from_sitemap(xml: &str) -> Vec<String> {
    let mut reader = Reader::from_str(xml);
    reader.trim_text(true);

    let mut buf = Vec::new();
    let mut urls = Vec::new();
    let mut in_loc = false;

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(element)) => {
                in_loc = element.local_name().as_ref() == b"loc";
            }
            Ok(Event::End(element)) => {
                if element.local_name().as_ref() == b"loc" {
                    in_loc = false;
                }
            }
            Ok(Event::Text(text)) => {
                if in_loc {
                    if let Ok(value) = text.unescape() {
                        let value = value.trim();
                        if !value.is_empty() {
                            urls.push(value.to_string());
                        }
                    }
                }
            }
            Ok(Event::Eof) => break,
            Err(error) => {
                warn!("解析 sitemap 失败,返回已解析部分 URL: {error}");
                break;
            }
            _ => {}
        }
        buf.clear();
    }

    urls
}

使用 quick-xml 库高效解析 XML,提取 <loc> 标签中的 URL 内容。

2.5 URL 去重和指纹检测

为了避免重复推送,实现了 URL 去重和内容变化检测:

use std::collections::{hash_map::DefaultHasher, HashSet};
use std::hash::{Hash, Hasher};

/// Removes duplicate URLs while keeping the first occurrence of each one in
/// its original position.
fn normalize_urls(urls: Vec<String>) -> Vec<String> {
    let mut seen = HashSet::new();
    urls.into_iter()
        // `insert` returns `false` for a value that is already present.
        .filter(|url| seen.insert(url.clone()))
        .collect()
}

/// Computes a 64-bit hash over the whole URL list with `DefaultHasher`.
///
/// The result depends on both the URLs and their order.
fn fingerprint_urls(urls: &[String]) -> u64 {
    let mut state = DefaultHasher::new();
    Hash::hash(urls, &mut state);
    state.finish()
}

normalize_urls 使用 HashSet 去除重复 URL,fingerprint_urls 计算所有 URL 的哈希指纹,用于检测 sitemap 内容是否发生变化。

2.6 推送到 IndexNow API

将 URL 批量推送到 Bing IndexNow API:

/// Maximum number of URLs placed in a single IndexNow request.
const INDEXNOW_BATCH_SIZE: usize = 10_000;
/// Maximum number of attempts made for one batch before the push is given up.
const PUSH_MAX_RETRIES: u32 = 3;

async fn push_index_to_bing(client: &Client, host: &Host, urls: &[String]) -> Result<()> {
    for (batch_index, url_batch) in urls.chunks(INDEXNOW_BATCH_SIZE).enumerate() {
        let payload = IndexNowPayload {
            host: host.host.clone(),
            key: host.key.clone(),
            key_location: host.key_location.clone(),
            url_list: url_batch.to_vec(),
        };

        let content = serde_json::to_string(&payload).context("序列化 IndexNow 请求失败")?;
        let mut retry = 0;
        loop {
            let response = client
                .post("https://api.indexnow.org/IndexNow")
                .header("content-type", "application/json; charset=utf-8")
                .body(content.clone())
                .send()
                .await
                .context("请求 Bing IndexNow 失败")?;

            let status = response.status();
            let body = response.text().await.unwrap_or_default();
            if status.is_success() {
                info!(
                    "推送成功 host={} batch={} size={}",
                    host.host,
                    batch_index + 1,
                    url_batch.len()
                );
                break;
            }

            retry += 1;
            if retry >= PUSH_MAX_RETRIES {
                anyhow::bail!(
                    "推送失败 host={} batch={} status={} body={}",
                    host.host,
                    batch_index + 1,
                    status,
                    body
                );
            }

            warn!(
                "推送失败重试 host={} batch={} retry={}/{} status={} body={}",
                host.host,
                batch_index + 1,
                retry,
                PUSH_MAX_RETRIES,
                status,
                body
            );
            sleep(Duration::from_secs(2_u64.pow(retry))).await;
        }
    }

    Ok(())
}

每次最多推送10,000个 URL,失败时采用指数退避策略重试,每个批次最多尝试3次(即最多重试2次)。

2.7 完整推送流程

整合所有功能的主推送逻辑:

async fn push_bing_index(client: &Client, config: &Config, host_fingerprints: &mut HashMap<String, u64>) {
    for host in &config.app.hosts {
        match get_sitemap_xml(client, host).await {
            Ok(xml) => {
                let urls = normalize_urls(extract_urls_from_sitemap(&xml));
                if urls.is_empty() {
                    warn!("站点 {} sitemap 未解析到 URL,跳过推送", host.host);
                    continue;
                }

                let current_fingerprint = fingerprint_urls(&urls);
                if host_fingerprints.get(&host.host) == Some(&current_fingerprint) {
                    info!("站点 {} sitemap 内容未变化,跳过本轮推送", host.host);
                    continue;
                }

                info!("站点 {} 共解析到 {} 条 URL,按批次推送", host.host, urls.len());
                if let Err(error) = push_index_to_bing(client, host, &urls).await {
                    error!("推送索引失败 host={}: {error:#}", host.host);
                    continue;
                }

                host_fingerprints.insert(host.host.clone(), current_fingerprint);
            }
            Err(error) => error!("获取网站地图失败 host={}: {error:#}", host.host),
        }
    }
}

3. 配置文件说明

配置文件使用YAML格式,支持配置多个站点:

app:
  hosts:
    - host: www.2tuan.work
      key: your-indexnow-key-here
      key-location: https://www.2tuan.work/your-indexnow-key-here.txt

配置项说明:

  • host:网站域名

  • key:IndexNow 密钥(8-128 个十六进制字符)

  • key-location:密钥验证文件的完整URL

其实就是Spring Boot项目的配置文件,直接挪用懒得修改了。

4. 部署建议

创建 Dockerfile

# Build stage: compile the release binary.
# NOTE(review): the image tag suggests an Ubuntu 24.04 (glibc) toolchain image - confirm.
FROM registry.cn-chengdu.aliyuncs.com/jxd134/rust:1.75-24.04_stable AS builder
WORKDIR /app
COPY . .
RUN cargo build --release

# Runtime stage: use the same glibc-based distribution as the build stage.
# A binary linked dynamically against glibc cannot start on Alpine (musl).
FROM ubuntu:24.04
# CA certificates are required for outgoing HTTPS requests.
RUN apt-get update \
    && apt-get install -y --no-install-recommends ca-certificates \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY --from=builder /app/target/release/blog-indexnow /app/blog-indexnow
COPY config /app/config
CMD ["./blog-indexnow"]

使用 docker-compose 运行:

version: '3.8'
services:
  indexnow:
    build:
      context: .
      dockerfile: deploy/Dockerfile
    container_name: blog-indexnow
    restart: unless-stopped
    environment:
      - APP_CONFIG=/app/config/application.yaml

5. 总结

ysGQHpTMgWIUSpDwmgCoUJVYRcTCYrOl.png


由此图可得知,使用Rust重构项目后,资源占用相当低,我的小云主机还能再坚持一阵。


Comment