Project Requirements
Remove duplicate records from POI data scraped from a map website. Each record carries fields such as id, name, lon, lat, hot, address, city, area, and town (see the loading code below).
Approach
POI deduplication is not a simple text-matching problem; going purely by edit distance can easily lead you down the wrong path.
POIs in different places can share exactly the same name, for example:
- 行政管理中心 (Administrative Management Center)
- 人民桥 (People's Bridge)
- 中央公园 (Central Park)
- …
Solution: use latitude and longitude to restrict the search to a bounded area, and look for duplicate POIs only within that area. The initial approach uses Geohash precision to define the area.
The chosen precision is 5. If boundary effects are a concern (a duplicate pair split across two adjacent cells), this can be handled by scaling the precision; a short sketch of the area-limiting idea follows.
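The snippet below is a minimal sketch of the area-limiting idea using the python-geohash package that the implementation imports later; the coordinates are made-up examples. Two POIs that share the name 人民桥 but sit in different cities land in different precision-5 cells, so they are never compared with each other.

```python
import geohash  # pip install python-geohash

# Hypothetical coordinates for two POIs named "人民桥" in different cities
poi_a = (31.2990, 120.5853)
poi_b = (32.0603, 118.7969)

cell_a = geohash.encode(poi_a[0], poi_a[1], precision=5)
cell_b = geohash.encode(poi_b[0], poi_b[1], precision=5)
print(cell_a, cell_b, cell_a == cell_b)  # different cells -> never compared

# If a duplicate pair could straddle a cell boundary, one option is to rerun
# the matching at a coarser precision (e.g. precision=4); checking the eight
# neighboring cells (geohash.neighbors) is another common workaround.
```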
Within the same cell there may still be two POIs whose names are extremely similar, for example:
- 苏州工业园区第七中学 (Suzhou Industrial Park No. 7 Middle School)
- 苏州工业园区第八中学 (Suzhou Industrial Park No. 8 Middle School)
Solution: segment the names into words and compute TF-IDF so that each word carries a different weight, then compare the weighted vectors; a small sketch of the effect follows.
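A minimal sketch of what this weighting buys, separate from the full pipeline below: assuming the two example names have already been segmented into space-separated words, the shared tokens get lower IDF weights while 第七 and 第八 keep higher ones, so the cosine similarity of the TF-IDF vectors drops well below 1.0.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# The two example POI names, pre-segmented into space-separated words
docs = ["苏州 工业园区 第七 中学", "苏州 工业园区 第八 中学"]

# token_pattern keeps single-character tokens as well (the default drops them)
vectorizer = TfidfVectorizer(analyzer='word', token_pattern=r"(?u)\b\w+\b")
tfidf = vectorizer.fit_transform(docs)

# Shared words are down-weighted, the distinguishing 第七 / 第八 are not,
# so the similarity is clearly below 1.0 despite the near-identical strings.
print(cosine_similarity(tfidf[0], tfidf[1])[0, 0])
```

In the full pipeline the same idea is applied to the POIs of each geohash cell, with sparse_dot_topn handling the pairwise cosine computation on the sparse TF-IDF matrix.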
Code Implementation
```python
import pandas as pd
import numpy as np
import geohash  # pip install python-geohash
from sklearn.feature_extraction.text import TfidfVectorizer
from sparse_dot_topn import awesome_cossim_topn
import pkuseg
from sqlalchemy import create_engine
from multiprocessing import Pool

engine = create_engine("mysql+pymysql://root:root@localhost/hackathon")

# Set in each worker process by init_worker(); Pool.map can only pickle
# top-level functions, so pair_insert and the data it needs live at module level.
poi_data = None


def step_1():
    """Data preparation."""
    seg = pkuseg.pkuseg()
    stopwords = [',', '\n', '(', ')', '|', '-', '/']

    def cut_word(content):
        # Segment the text with pkuseg, drop punctuation-like stopwords, and
        # join the remaining tokens with spaces so TfidfVectorizer can tokenize them.
        words = seg.cut(str(content))
        return " ".join(w for w in words if w not in stopwords)

    # Load the raw data
    poi = pd.read_csv("data/poi_data.txt", sep='\t',
                      names=['id', 'name', 'lon', 'lat', 'hot',
                             'address', 'city', 'area', 'town'])

    # Drop rows with invalid coordinates
    poi = poi[(poi['lat'] <= 90.0) & (poi['lat'] >= -90.0) &
              (poi['lon'] <= 180) & (poi['lon'] >= -180)]

    # Add a geohash column at precision 5 (cell size roughly 4.9 km)
    poi["geohash"] = np.vectorize(geohash.encode)(poi['lat'], poi['lon'], precision=5)

    # Add the segmented-text columns
    poi["name_cut"] = np.vectorize(cut_word)(poi['name'])
    poi["address_cut"] = np.vectorize(cut_word)(poi['address'])
    poi['name_address_cut'] = poi['name_cut'] + ' ' + poi['address_cut']

    # Save the prepared data to a pickle file
    poi.to_pickle("./poi_data.pkl")


def init_worker():
    """Pool initializer: load the prepared data once per worker process."""
    global poi_data
    poi_data = pd.read_pickle("./poi_data.pkl")


def get_matches_df(sparse_matrix, df_name_vector, top=500):
    """Convert the sparse cosine-similarity matrix into a DataFrame of matched pairs."""
    name_vector = df_name_vector['name_address_cut']
    sparserows, sparsecols = sparse_matrix.nonzero()
    nr_matches = min(top, sparserows.size)

    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    left_id = np.empty([nr_matches], dtype=object)
    right_id = np.empty([nr_matches], dtype=object)
    similarity = np.zeros(nr_matches)

    for index in range(nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        left_id[index] = df_name_vector.iloc[sparserows[index]]['id']
        right_id[index] = df_name_vector.iloc[sparsecols[index]]['id']
        similarity[index] = sparse_matrix.data[index]

    return pd.DataFrame({'left_side': left_side, 'right_side': right_side,
                         'left_id': left_id, 'right_id': right_id,
                         'similarity': similarity})


def pair_insert(ghash):
    """Compute pairwise similarities within one geohash cell and store them."""
    poi_area = poi_data[poi_data["geohash"] == ghash].reset_index(drop=True)
    if len(poi_area) < 2:
        return  # nothing to compare in this cell

    vectorizer = TfidfVectorizer(analyzer='word', sublinear_tf=True)
    # vectorizer = TfidfVectorizer(analyzer='word', token_pattern="(?u)\\b\\w+\\b", sublinear_tf=True)
    tf_idf_matrix = vectorizer.fit_transform(poi_area['name_address_cut'])

    # Top-5 cosine similarities per row, computed directly on the sparse matrices
    matches = awesome_cossim_topn(tf_idf_matrix, tf_idf_matrix.transpose().tocsr(), 5)

    matches_df = get_matches_df(matches, poi_area)
    # Drop pairs with similarity ~1.0 (mostly each POI matched with itself)
    matches_df = matches_df[matches_df['similarity'] < 0.99999]
    print(matches_df.shape)

    matches_df['left_id'] = matches_df['left_id'].astype('int')
    matches_df['right_id'] = matches_df['right_id'].astype('int')
    matches_df['similarity'] = matches_df['similarity'].astype('float')
    matches_df.to_sql(con=engine, name='pair_result', if_exists='append')


def step_2():
    """Similarity computation; results are appended to the database."""
    all_hashes = pd.read_pickle("./poi_data.pkl")["geohash"].unique()
    pool = Pool(8, initializer=init_worker)
    pool.map(pair_insert, all_hashes)
    pool.close()
    pool.join()


if __name__ == "__main__":
    step_1()
    step_2()
```
Example of matching within a single area:
```python
# `vectorizer` is the TfidfVectorizer fitted on one area's segmented strings.
# Inspect the IDF weight learned for every vocabulary term
# (use get_feature_names_out() on newer scikit-learn versions).
df_idf = pd.DataFrame(vectorizer.idf_,
                      index=vectorizer.get_feature_names(),
                      columns=["idf_weights"])
df_idf.sort_values(by=['idf_weights'])
```
The TF-IDF weights of each POI's segmented words:
```python
# `df` is the prepared POI DataFrame; print the IDF weight of every word
# in the first two segmented name+address strings.
test = df["name_address_cut"][:2].tolist()
idf_dict = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
for i in range(len(test)):
    print(test[i])
    for word in test[i].split(" "):
        try:
            print(word, idf_dict[word])
        except KeyError:
            pass
    print("")
```
Final result: