文献术语分布矩阵的laravel实现方案

一、现有表结构梳理(适配映射)

现有的表对应功能映射到方案中
books图书books
wordsets词集(术语集)term_sets
wordentries词条(含子集)terms(树形)
sections段落paragraphs
completions对话(也是一种“段落”)paragraphs(统一处理)

关键点:sections 和 completions 都作为 分析单元(paragraph),统一建模。


二、最终数据库设计(Laravel + MySQL 8 + SQLite 分片)

mermaid

graph LR
    A[MySQL 8] --> B[books, wordsets, wordentries]
    A --> C[sections, completions]
    A --> D[book_wordset_summary 汇总表]
    E[SQLite 分片] --> F[明细: section_wordentry / completion_wordentry]

三、MySQL 8 表结构(Laravel Migration)1. books(已有)

php

Schema::create('books', function (Blueprint $table) {
    $table->id();
    $table->string('title');
    $table->unsignedBigInteger('wordset_id')->nullable(); // 一本书一个词集
    $table->timestamps();
});

2. wordsets(词集)

php

Schema::create('wordsets', function (Blueprint $table) {
    $table->id();
    $table->string('name');
    $table->text('description')->nullable();
    $table->timestamps();
});

3. wordentries(树形词条,支持子集)

php

Schema::create('wordentries', function (Blueprint $table) {
    $table->id();
    $table->foreignId('wordset_id')->constrained()->cascadeOnDelete();
    $table->unsignedBigInteger('parent_id')->nullable(); // 子集
    $table->string('name');
    $table->json('aliases')->nullable(); // 别名数组
    $table->enum('type', ['subset', 'term']); // subset=子集, term=词条
    $table->text('description')->nullable();
    $table->timestamps();

    $table->foreign('parent_id')->references('id')->on('wordentries')->nullOnDelete();
    $table->index(['wordset_id', 'parent_id']);
});

4. sections(段落)

php

Schema::create('sections', function (Blueprint $table) {
    $table->id();
    $table->foreignId('book_id')->constrained()->cascadeOnDelete();
    $table->integer('section_index'); // 顺序
    $table->text('content');
    $table->integer('word_count')->default(0);
    $table->timestamps();

    $table->unique(['book_id', 'section_index']);
});

5. completions(对话)

php

Schema::create('completions', function (Blueprint $table) {
    $table->id();
    $table->foreignId('book_id')->constrained()->cascadeOnDelete();
    $table->integer('completion_index'); // 顺序
    $table->text('content');
    $table->integer('word_count')->default(0);
    $table->timestamps();

    $table->unique(['book_id', 'completion_index']);
});

6. 核心:汇总表 book_wordset_summary

php

// database/migrations/2025_10_28_create_book_wordset_summary.php
Schema::create('book_wordset_summary', function (Blueprint $table) {
    $table->unsignedBigInteger('book_id');
    $table->unsignedBigInteger('wordentry_id');
    $table->unsignedBigInteger('wordset_id');

    $table->unsignedInteger('section_count')->default(0);     // 在多少 section 出现
    $table->unsignedInteger('completion_count')->default(0);  // 在多少 completion 出现
    $table->unsignedInteger('total_frequency')->default(0);   // 总频次
    $table->unsignedInteger('first_appear')->nullable();      // 首次出现位置(index)
    $table->unsignedInteger('last_appear')->nullable();       // 末次出现

    $table->primary(['book_id', 'wordentry_id']);
    $table->index('wordset_id');
    $table->index('wordentry_id');
    $table->index(['wordset_id', 'section_count']);
    $table->index(['wordset_id', 'completion_count']);
});

四、SQLite 分片(明细存储)路径

bash

/storage/app/word_matrix/
├── 0000.db  # book_id 0~9999
├── 0001.db  # ...

每分片建表(两个表:section + completion)

sql

-- 每个 .db 文件包含
CREATE TABLE section_word (
    section_id   INTEGER NOT NULL,
    wordentry_id INTEGER NOT NULL,
    frequency    INTEGER NOT NULL DEFAULT 1,
    positions    TEXT,  -- JSON 数组
    PRIMARY KEY (section_id, wordentry_id)
);

CREATE TABLE completion_word (
    completion_id INTEGER NOT NULL,
    wordentry_id  INTEGER NOT NULL,
    frequency     INTEGER NOT NULL DEFAULT 1,
    positions     TEXT,
    PRIMARY KEY (completion_id, wordentry_id)
);

CREATE INDEX idx_word_section ON section_word(wordentry_id);
CREATE INDEX idx_word_completion ON completion_word(wordentry_id);

五、Laravel 分析任务(队列)

php

// app/Jobs/AnalyzeBookWordDistribution.php
class AnalyzeBookWordDistribution implements ShouldQueue
{
    public function __construct(public Book $book) {}

    public function handle()
    {
        $wordset = $this->book->wordset;
        $leafTerms = $wordset->wordentries()->where('type', 'term')->get();

        // 1. 构建 Aho-Corasick
        $ac = new AhoCorasick();
        foreach ($leafTerms as $term) {
            $ac->add($term->name, $term->id);
            foreach ($term->aliases ?? [] as $alias) {
                $ac->add($alias, $term->id);
            }
        }
        $ac->build();

        // 2. 打开 SQLite 分片
        $shard = sprintf("%04d", $this->book->id / 10000);
        $dbPath = storage_path("app/word_matrix/{$shard}.db");
        $sqlite = new \PDO("sqlite:$dbPath");
        $sqlite->exec("PRAGMA journal_mode = WAL; PRAGMA synchronous = NORMAL;");

        $stmtSection = $sqlite->prepare("
            INSERT OR REPLACE INTO section_word 
            VALUES (?, ?, ?, ?)
        ");
        $stmtCompletion = $sqlite->prepare("
            INSERT OR REPLACE INTO completion_word 
            VALUES (?, ?, ?, ?)
        ");

        // 3. 统计汇总
        $stats = []; // wordentry_id => [sec_count, comp_count, freq, first, last]

        // 处理 sections
        foreach ($this->book->sections()->orderBy('section_index')->get() as $sec) {
            $matches = $ac->search($sec->content);
            foreach ($matches as $wid => $pos) {
                $freq = count($pos);
                $stmtSection->execute([$sec->id, $wid, $freq, json_encode($pos)]);

                $stats[$wid]['sec'] = ($stats[$wid]['sec'] ?? 0) + 1;
                $stats[$wid]['freq'] = ($stats[$wid]['freq'] ?? 0) + $freq;
                $stats[$wid]['first'] ??= $sec->section_index;
                $stats[$wid]['last'] = $sec->section_index;
            }
        }

        // 处理 completions
        foreach ($this->book->completions()->orderBy('completion_index')->get() as $comp) {
            $matches = $ac->search($comp->content);
            foreach ($matches as $wid => $pos) {
                $freq = count($pos);
                $stmtCompletion->execute([$comp->id, $wid, $freq, json_encode($pos)]);

                $stats[$wid]['comp'] = ($stats[$wid]['comp'] ?? 0) + 1;
                $stats[$wid]['freq'] = ($stats[$wid]['freq'] ?? 0) + $freq;
                $stats[$wid]['first'] ??= $comp->completion_index + 10000; // 避免冲突
                $stats[$wid]['last'] = $comp->completion_index + 10000;
            }
        }

        // 4. 更新 MySQL 汇总表
        foreach ($stats as $wid => $s) {
            DB::statement("
                INSERT INTO book_wordset_summary 
                (book_id, wordentry_id, wordset_id, section_count, completion_count, total_frequency, first_appear, last_appear)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                ON DUPLICATE KEY UPDATE
                section_count = VALUES(section_count),
                completion_count = VALUES(completion_count),
                total_frequency = VALUES(total_frequency),
                first_appear = LEAST(COALESCE(first_appear, 999999), VALUES(first_appear)),
                last_appear = GREATEST(COALESCE(last_appear, 0), VALUES(last_appear))
            ", [
                $this->book->id,
                $wid,
                $wordset->id,
                $s['sec'] ?? 0,
                $s['comp'] ?? 0,
                $s['freq'],
                $s['first'],
                $s['last']
            ]);
        }
    }
}

六、查询 API 示例1. 词集在多本书的覆盖率

php

public function coverage($wordsetId)
{
    return DB::table('book_wordset_summary')
        ->join('wordentries', 'wordentry_id', '=', 'wordentries.id')
        ->where('wordset_id', $wordsetId)
        ->whereRaw('(section_count + completion_count) > 0')
        ->selectRaw('
            wordentries.name,
            COUNT(DISTINCT book_id) as book_count,
            AVG(section_count) as avg_sections,
            AVG(completion_count) as avg_completions
        ')
        ->groupBy('wordentry_id', 'wordentries.name')
        ->orderByDesc('book_count')
        ->get();
}

七、自动分片初始化命令

php

// app/Console/Commands/InitWordMatrixShard.php
Artisan::command('wordmatrix:init {shard}', function ($shard) {
    $path = storage_path("app/word_matrix/{$shard}.db");
    if (file_exists($path)) return;

    $sqlite = new PDO("sqlite:$path");
    $sqlite->exec("
        CREATE TABLE section_word (...);
        CREATE TABLE completion_word (...);
        CREATE INDEX ...
    ");
    $this->info("Shard $shard created.");
});

八、性能与存储

项目估算
book_wordset_summary50亿行 → ~300GB
SQLite 分片1000个 × 6GB = 6TB
~6.3TB
服务器1台 MySQL + 1台 NFS/本地盘

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注

这个站点使用 Akismet 来减少垃圾评论。了解你的评论数据如何被处理