mirror of https://github.com/ArthurIdema/Zoekeend-Phrase-Indexing.git
synced 2025-10-26 16:24:21 +00:00
import duckdb
from collections import Counter


def create_tokenizer_duckdb(con):
    """Register a tokenize(s) macro: strip accents, lower-case, and split on
    digits, punctuation, and whitespace."""
    con.sql("""
    CREATE TEMPORARY MACRO tokenize(s) AS (
        string_split_regex(regexp_replace(lower(strip_accents(CAST(s AS VARCHAR))), '[0-9!@#$%^&*()_+={}\\[\\]:;<>,.?~\\\\/\\|''''"`-]+', ' ', 'g'), '\\s+')
    );
    """)


def extract_phrases(documents, n=2, min_freq=2, db_path='phrases.db'):
    """Extract n-grams occurring at least `min_freq` times across
    `documents`, a sequence of (doc_id, text) pairs."""
    con = duckdb.connect(database=db_path)
    create_tokenizer_duckdb(con)

    # Load documents into a DuckDB table
    con.execute("CREATE TEMP TABLE docs AS SELECT * FROM (VALUES " +
                ",".join(["(?, ?)"] * len(documents)) +
                ") AS t(doc_id, text)", [item for pair in documents for item in pair])

    # Tokenize and flatten tokens in DuckDB
    tokens_df = con.sql("""
        SELECT doc_id, unnest(tokenize(text)) AS token
        FROM docs
    """).df()

    # Generate n-grams in Python
    token_counter = Counter()
    ngram_counter = Counter()

    grouped = tokens_df.groupby('doc_id')['token'].apply(list)

    total_tokens = 0
    for token_list in grouped:
        total_tokens += len(token_list)
        token_counter.update(token_list)
        # Sliding window: zip n staggered copies of the token list
        ngrams = zip(*[token_list[i:] for i in range(n)])
        ngram_counter.update(ngrams)

    # Keep n-grams that occur at least min_freq times
    phrases = [" ".join(ngram) for ngram, freq in ngram_counter.items() if freq >= min_freq]
    return phrases
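

def _ngrams_example(tokens=("new", "york", "city"), n=2):
    """Hypothetical helper (not part of the original module) illustrating the
    zip-based n-gram trick used above: zipping n staggered copies of the
    token sequence yields its n-grams, e.g. the defaults give
    [('new', 'york'), ('york', 'city')]."""
    return list(zip(*[tokens[i:] for i in range(n)]))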


def extract_phrases_pmi_duckdb(con, fts_schema, n=2, min_freq=2, min_pmi=3.0):
    """Extract high-PMI adjacent word pairs and build a term dictionary.

    Creates {fts_schema}.phrases (bigrams with PMI >= min_pmi and frequency
    >= min_freq) and {fts_schema}.dict (phrases plus frequent single tokens,
    both filtered against {fts_schema}.stopwords). `n` is currently unused:
    only bigrams are extracted."""
    # 1. Create a tokenized table
    con.execute(f"""
        CREATE OR REPLACE TABLE {fts_schema}.tokens AS
        SELECT did AS doc_id,
               unnest({fts_schema}.tokenize(content)) AS token
        FROM documents;
    """)

    print("Tokenized documents:\n", con.execute(f"SELECT * FROM {fts_schema}.tokens LIMIT 10").fetchall())

    # 2. Add a position index for each token in its document
    con.execute(f"""
        CREATE OR REPLACE TABLE {fts_schema}.tokens_pos AS
        SELECT doc_id, token,
               ROW_NUMBER() OVER (PARTITION BY doc_id ORDER BY rowid) AS pos
        FROM {fts_schema}.tokens
    """)

    # 3. Compute the total token count
    total_tokens = con.execute(f"SELECT COUNT(*)::DOUBLE FROM {fts_schema}.tokens_pos").fetchone()[0]

    # 4. Compute token frequencies
    con.execute(f"""
        CREATE OR REPLACE TABLE {fts_schema}.token_freq AS
        SELECT token,
               COUNT(*) AS freq,
               COUNT(DISTINCT doc_id) AS doc_freq
        FROM {fts_schema}.tokens_pos
        GROUP BY token
    """)
    print("Token frequency:\n", con.execute(f"SELECT * FROM {fts_schema}.token_freq LIMIT 10").fetchall())

    # 5. Compute bigrams by joining each token to its successor
    con.execute(f"""
        CREATE OR REPLACE TABLE {fts_schema}.ngrams AS
        SELECT t1.token AS w1, t2.token AS w2,
               t1.doc_id AS doc_id
        FROM {fts_schema}.tokens_pos t1
        JOIN {fts_schema}.tokens_pos t2
          ON t1.doc_id = t2.doc_id AND t2.pos = t1.pos + 1
    """)

    # 6. Compute n-gram frequencies
    con.execute(f"""
        CREATE OR REPLACE TABLE {fts_schema}.ngram_freq AS
        SELECT w1, w2, COUNT(*) AS freq,
               COUNT(DISTINCT doc_id) AS doc_freq
        FROM {fts_schema}.ngrams
        GROUP BY w1, w2
        HAVING COUNT(*) >= {min_freq}
    """)

    print("N-gram frequency:\n", con.execute(f"SELECT * FROM {fts_schema}.ngram_freq LIMIT 10").fetchall())
    print(f"Number of n-grams: {con.execute(f'SELECT COUNT(*) FROM {fts_schema}.ngram_freq').fetchone()[0]}")

    # 7. Compute PMI for bigrams
    con.execute(f"""
        CREATE OR REPLACE TABLE {fts_schema}.phrases AS
        SELECT w1 || ' ' || w2 AS phrase,
               LOG(n.freq * {total_tokens} / (f1.freq * f2.freq)) / LOG(2) AS pmi,
               n.doc_freq AS df
        FROM {fts_schema}.ngram_freq n
        JOIN {fts_schema}.token_freq f1 ON n.w1 = f1.token
        JOIN {fts_schema}.token_freq f2 ON n.w2 = f2.token
        WHERE LOG(n.freq * {total_tokens} / (f1.freq * f2.freq)) / LOG(2) >= {min_pmi}
        ORDER BY pmi DESC
    """)

    print("Extracted phrases:\n", con.execute(f"SELECT phrase, pmi, df FROM {fts_schema}.phrases LIMIT 10").fetchall())
    print("Extracted tokens:\n", con.execute(f"SELECT token FROM {fts_schema}.token_freq LIMIT 10").fetchall())

    # 8. Combine phrases and single words into one term dictionary, skipping
    #    phrases that contain a stopword and infrequent or stopword tokens
    con.execute(f"""
        CREATE OR REPLACE TABLE {fts_schema}.dict AS
        SELECT ROW_NUMBER() OVER () AS termid, phrase AS term, df
        FROM {fts_schema}.phrases
        WHERE NOT EXISTS (
            SELECT 1 FROM UNNEST(string_split(phrase, ' ')) AS word
            WHERE word.unnest IN (SELECT sw FROM {fts_schema}.stopwords)
        )
        UNION ALL
        SELECT ROW_NUMBER() OVER () + (SELECT COUNT(*) FROM {fts_schema}.phrases) AS termid,
               token AS term, doc_freq AS df
        FROM {fts_schema}.token_freq
        WHERE token NOT IN (SELECT sw FROM {fts_schema}.stopwords)
          AND freq >= {min_freq}
    """)

    print("Phrases:\n", con.execute(f"SELECT term, df FROM {fts_schema}.dict LIMIT 10").fetchall())

    # Drop the intermediate tables
    con.execute(f"DROP TABLE IF EXISTS {fts_schema}.tokens_pos")
    con.execute(f"DROP TABLE IF EXISTS {fts_schema}.token_freq")
    con.execute(f"DROP TABLE IF EXISTS {fts_schema}.ngrams")
    con.execute(f"DROP TABLE IF EXISTS {fts_schema}.ngram_freq")
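

if __name__ == "__main__":
    # Minimal smoke test of extract_phrases -- a sketch with made-up
    # documents; ':memory:' keeps the demo from writing a phrases.db file.
    docs = [
        (1, "New York is a big city. New York never sleeps."),
        (2, "Last year I moved to New York."),
    ]
    print(extract_phrases(docs, n=2, min_freq=2, db_path=":memory:"))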