短文本匹配实战：重复POI识别

文章内容如有错误或排版问题，请提交反馈，非常感谢！

项目需求

去除从地图网站抓取的POI数据中的重复数据。示例数据如下：

思考逻辑

POI去重并非简单的文本匹配问题，如果只按编辑距离来做，很容易走入误区。

不同地方有相同名字的POI点，如：

行政管理中心
人民桥
中央公园
…

解决方案：通过经纬度限定区域，在限定区域内找出重复POI。初步方案用Geohash的精度做区域限定。

选定的精度是5（每个网格的误差范围约为4.9km）。如果担心重复的POI恰好落在相邻网格的边界两侧而被漏掉，可以通过调整Geohash精度（例如降低精度以扩大网格范围）来解决。

相同区域内，可能存在两个极为相似的POI点，如：

苏州工业园区第七中学
苏州工业园区第八中学

解决方案：分词后计算TF-IDF，给每个词赋予不同的权重，再用余弦相似度进行比较。

代码实现

import pandas as pd
import numpy as np
import geohash # pip install python-geohash
from sklearn.feature_extraction.text import TfidfVectorizer
from sparse_dot_topn import awesome_cossim_topn
import pkuseg
from sqlalchemy import create_engine
from multiprocessing import Pool

# SQLAlchemy connection URL (MySQL through the PyMySQL driver) used by the
# module-level engine below.
_DB_URL = "mysql+pymysql://root:root@localhost/hackathon"
engine = create_engine(_DB_URL)


def step_1():
    """Prepare the POI data: clean it, geohash it, tokenize it and cache it.

    Reads the tab-separated file ``data/poi_data.txt``, drops rows whose
    latitude/longitude fall outside the valid range, adds a precision-5
    ``geohash`` column plus the word-segmented ``name_cut``, ``address_cut``
    and ``name_address_cut`` columns, and pickles the resulting DataFrame to
    ``./poi_data.pkl``.
    """
    seg = pkuseg.pkuseg()
    # A set gives O(1) membership tests while filtering tokens.
    stopwords = {',', '\n', '(', ')', '|', '-', '/'}

    def cut_word(content):
        """Segment *content* and return the kept tokens separated by spaces."""
        # The tokens must stay space-separated: joining them with "" would
        # glue the segmented words back together, and the word-level TF-IDF
        # vectorizer would then see each name/address as one single token.
        return " ".join(w for w in seg.cut(str(content)) if w not in stopwords)

    # Load the raw data.
    poi_data = pd.read_csv(
        "data/poi_data.txt",
        sep='\t',
        names=['id', 'name', 'lon', 'lat', 'hot', 'address', 'city', 'area', 'town'],
    )

    # Drop rows with out-of-range coordinates (both bounds inclusive). The
    # copy makes the result an independent frame, so adding columns below
    # does not write into a slice of the original.
    valid_coords = (
        poi_data['lat'].between(-90.0, 90.0)
        & poi_data['lon'].between(-180, 180)
    )
    poi_data = poi_data[valid_coords].copy()

    # Add the GEOHASH column at precision 5, i.e. an error range of roughly
    # 4.9km. A plain comprehension (unlike np.vectorize) also works when no
    # rows are left after filtering.
    poi_data["geohash"] = [
        geohash.encode(lat, lon, precision=5)
        for lat, lon in zip(poi_data['lat'], poi_data['lon'])
    ]

    # Add the word-segmented columns.
    poi_data["name_cut"] = poi_data['name'].map(cut_word)
    poi_data["address_cut"] = poi_data['address'].map(cut_word)
    poi_data['name_address_cut'] = poi_data['name_cut'] + ' ' + poi_data['address_cut']

    # Cache the prepared data in a pickle file.
    poi_data.to_pickle("./poi_data.pkl")


def _get_matches_df(sparse_matrix, df_name_vactor, top=500):
    """Turn a sparse similarity matrix into a DataFrame of matched pairs.

    Args:
        sparse_matrix: SciPy sparse matrix whose stored entry (i, j) is the
            similarity between rows i and j of ``df_name_vactor``.
        df_name_vactor: DataFrame with ``id`` and ``name_address_cut``
            columns, in the same row order as the matrix.
        top: Maximum number of pairs to return.

    Returns:
        DataFrame with the columns ``left_side``, ``right_side``,
        ``left_id``, ``right_id`` and ``similarity``.
    """
    # One COO view keeps row index, column index and value aligned.
    coo = sparse_matrix.tocoo()
    # Cap by the number of stored matches, not by the number of matrix rows:
    # capping by rows silently dropped pairs and could index past the end
    # when the matrix holds fewer entries than rows.
    nr_matches = min(top, coo.nnz)
    rows = coo.row[:nr_matches]
    cols = coo.col[:nr_matches]

    # Positional lookups, so the result does not depend on the frame's index.
    names = df_name_vactor['name_address_cut'].to_numpy()
    ids = df_name_vactor['id'].to_numpy()

    return pd.DataFrame({'left_side': names[rows],
                         'right_side': names[cols],
                         'left_id': ids[rows],
                         'right_id': ids[cols],
                         'similarity': coo.data[:nr_matches]})


def _pair_insert(poi_area):
    """Find similar POI pairs inside one geohash cell and store them.

    Lives at module level because ``multiprocessing.Pool`` pickles the
    function it dispatches, and functions nested inside another function
    cannot be pickled.

    Args:
        poi_area: DataFrame holding the POIs of a single geohash cell, with
            ``id`` and ``name_address_cut`` columns.
    """
    # NOTE: the default token_pattern only keeps tokens of two or more
    # characters; pass token_pattern="(?u)\\b\\w+\\b" to keep
    # single-character words as well.
    vectorizer = TfidfVectorizer(analyzer='word', sublinear_tf=True)
    tf_idf_matrix = vectorizer.fit_transform(poi_area['name_address_cut'])
    # Keep the 5 most similar candidates for every POI.
    matches = awesome_cossim_topn(tf_idf_matrix, tf_idf_matrix.transpose(), 5)
    matches_df = _get_matches_df(matches, poi_area)
    # Drop (near-)perfect scores, which removes each POI's match with itself.
    # NOTE(review): this also discards distinct POIs whose text is identical;
    # confirm that is intended. The copy avoids assigning into a slice below.
    matches_df = matches_df[matches_df['similarity'] < 0.99999].copy()
    print(matches_df.shape)
    matches_df['left_id'] = matches_df['left_id'].astype('int')
    matches_df['right_id'] = matches_df['right_id'].astype('int')
    matches_df['similarity'] = matches_df['similarity'].astype('float')
    matches_df.to_sql(con=engine, name='pair_result', if_exists='append')


def step_2():
    """Compute per-region similarities and append them to the database.

    Loads ``./poi_data.pkl``, splits it by ``geohash`` and processes every
    region in a pool of 8 worker processes; each worker appends its matches
    to the ``pair_result`` table.
    """
    poi_data = pd.read_pickle("./poi_data.pkl")

    # One freshly indexed frame per geohash cell, in order of first
    # appearance. reset_index must not be called with inplace=True here: that
    # form returns None. A cell with a single POI cannot contain a pair, so
    # it is skipped.
    areas = [
        group.reset_index(drop=True)
        for _, group in poi_data.groupby("geohash", sort=False)
        if len(group) > 1
    ]

    # map() blocks until every region is done, so leaving the with-block
    # afterwards is safe.
    with Pool(8) as pool:
        pool.map(_pair_insert, areas)


if __name__ == "__main__":
    # Run the pipeline in order: prepare the data first, then compute the
    # similarities.
    for pipeline_step in (step_1, step_2):
        pipeline_step()

按区域进行匹配示例：

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per vocabulary term with its IDF weight, shown in ascending order.
df_idf = pd.DataFrame(vectorizer.idf_, index=feature_names, columns=["idf_weights"])
df_idf.sort_values(by=['idf_weights'])

每个POI分词后的TF-IDF：

# Take the first two segmented "name + address" strings as samples.
test = df["name_address_cut"][:2].tolist()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
idf_dict = dict(zip(feature_names, vectorizer.idf_))

for text in test:
    print(text)
    for word in text.split(" "):
        # Words that are not in the fitted vocabulary have no IDF weight and
        # are skipped; an explicit membership test replaces the former bare
        # `except`, which would also have hidden unrelated errors.
        if word in idf_dict:
            print(word, idf_dict[word])
    print("")