假设有下面这个表,seller_id+buyer_id是需要唯一的,
-- Sample table: id is the primary key. The (seller_id, buyer_id) pair is
-- meant to be unique, but this definition does not enforce it.
create table tab (
    id        int,
    seller_id int,
    buyer_id  int,
    info      varchar(100),
    ts        date,
    primary key (id)
);
由于没有加上唯一约束,某些seller_id+buyer_id出现了重复数据,如何找出并清理重复数据?
首先可以按seller_id+buyer_id分组,找出记录数大于1的seller_id+buyer_id组合:
-- List every (seller_id, buyer_id) pair that occurs more than once,
-- together with the number of rows sharing that pair.
select
    seller_id,
    buyer_id,
    count(*)
from tab
group by
    seller_id,
    buyer_id
having count(*) > 1;
然后查出这些seller_id+buyer_id组合下的所有记录
-- Return every row whose (seller_id, buyer_id) pair appears more than once.
-- The derived table "b" holds the duplicated pairs; joining back to tab
-- yields the full rows for inspection before anything is deleted.
select a.*
from tab as a
inner join (
    select
        seller_id,
        buyer_id
    from tab
    group by
        seller_id,
        buyer_id
    having count(*) > 1
) as b
    on a.seller_id = b.seller_id
    and a.buyer_id = b.buyer_id
-- Keep the rows of each duplicate group next to each other.
order by
    a.seller_id,
    a.buyer_id,
    a.id;
如果检查后发现seller_id+buyer_id只需要保留一条记录,如保留id最大的记录,
-- Remove duplicate rows, keeping only the row with the largest id for each
-- duplicated (seller_id, buyer_id) pair.
--
-- Written as a multi-table DELETE joined to an aggregated derived table:
-- MySQL rejects "delete from tab where id in (select ... from tab ...)"
-- with error 1093, because the target table is read directly by the subquery.
--
-- Note: rows with a NULL seller_id or buyer_id are never matched by the
-- equality join below, so they are left untouched.
delete a
from tab as a
inner join (
    select
        seller_id,
        buyer_id,
        max(id) as max_id
    from tab
    group by
        seller_id,
        buyer_id
    having count(*) > 1
) as b
    on a.seller_id = b.seller_id
    and a.buyer_id = b.buyer_id
-- Every row of the pair except the one holding max_id is deleted.
where a.id < b.max_id;
删除掉重复记录后,需要加上唯一索引:
-- Enforce uniqueness of (seller_id, buyer_id) from now on; this fails if
-- duplicate pairs are still present in the table.
alter table tab
    add unique key uk_tab_sid_bid (seller_id, buyer_id);
__END__