Tuesday, June 26, 2012

Stata Fuzzy match command

* This command checks whether two string variables match.  The match can be made under a range of criteria.  It is useful when comparing two variables, such as names, that may differ in word order or spelling but probably refer to the same thing.  If you allow some letters to differ, the command also creates a variable that records how many letters differ in each comparison.

* Command Written by Francis Smart
* fuzzy: flag observations in which two string variables match, exactly or approximately.
*
* Syntax:  fuzzy var1 var2 [, options]
*
* Options:
*   gen_match(name)            name of the 0/1 match indicator (default: matched).
*                              It is missing where either input string is empty.
*   free_letters(#)            number of letters allowed to differ (default 0).
*   blanks                     remove spaces before comparing.
*   words_unordered            also try the comparison with the first two words of the
*                              shorter string swapped; implies blanks.
*   letters_unordered          ignore letter order: count the letters of the longer
*                              string that cannot be found in the shorter one.
*   missed_letter_count(name)  name of the variable holding the number of differing
*                              letters (default: missed_count).  It is only created when
*                              free_letters is positive or letters_unordered is set.
*   reuse_letters              with letters_unordered, a letter of the shorter string
*                              may be matched more than once.
*   exclude(chars)             space-separated list of strings removed before comparing.
*   case                       compare in lower case.
*
* Any existing variables with the output names are replaced.
cap program drop fuzzy
program define fuzzy

  syntax varlist(min=2 max=2 string)     ///
         [, Gen_match(string) Free_letters(integer 0) ///
            Blanks Words_unordered Letters_unordered    ///
            Missed_letter_count(string) ///
            Reuse_letters Exclude(string) ///
            Case ]

  * Use the names parsed by syntax rather than the raw positional arguments,
  * which may still carry the comma that introduces the options.
  tokenize `varlist'
  local var1 `1'
  local var2 `2'

  if "`gen_match'"=="" local gen_match = "matched"
  if "`missed_letter_count'"=="" local missed_letter_count = "missed_count"

  * The output variables are dropped and recreated below, so they must not be
  * the inputs themselves.
  foreach newvar in `gen_match' `missed_letter_count' {
    if inlist("`newvar'", "`var1'", "`var2'") {
      di as error "`newvar' is one of the variables being compared; choose another name"
      exit 198
    }
  }

  * 1 when a missed letter count has to be produced, 0 otherwise
  local count_missed = (0 < `free_letters' | "`letters_unordered'" != "")

  di _newline as text "Fuzzy matching (`var1' `var2')"
  di as text "Generating match indicator variable (`gen_match')"
  di as text "Number of free letters is (`free_letters')"

  if "`words_unordered'"!="" {
    di as text "Word order does not matter (up to two words)"
    local blanks = "blanks"
  }

  if "`exclude'" != "" di "Characters `exclude' ignored"
  if "`blanks'" != "" di as text "Blanks dropped"
  if "`letters_unordered'" != "" di as text "Letter order does not matter when searching for match"
  if "`reuse_letters'" != "" di as text "Letters may be resused when searching for matches"
  if "`case'" != "" di "Case does not matter"
  if `count_missed' di "Missed letter count variable created will be (`missed_letter_count')"

  cap drop `gen_match'
  if _rc==0 noi di _newline "Matched indicator var: <`gen_match'> replaced"

  * The indicator is left missing where either string is empty
  gen `gen_match'=0 if `var1' != "" & `var2' != ""
  label var `gen_match' "Match indicator variable"

  if `count_missed' {
    cap drop `missed_letter_count'
    if _rc==0 noi di "`missed_letter_count' replaced"

    gen `missed_letter_count'=0
    label var `missed_letter_count' "Number of letters missed in matchup"
  }

  * All scratch data lives in temporary variables so nothing leaks into the
  * dataset and no user variable can be dropped by accident.
  tempvar long_raw short_raw a b flip hold len rest pos chr

  quietly {

    * Untouched copies of the inputs, longer string first.  Ties keep var1 first.
    gen `long_raw' = `var1'
    gen `short_raw' = `var2'
    replace `long_raw' = `var2' if length(`var2')>length(`var1')
    replace `short_raw' = `var1' if length(`var2')>length(`var1')

    * Working copies and helpers, refilled on every pass
    gen `a' = ""
    gen `b' = ""
    gen `hold' = ""
    gen byte `flip' = 0
    gen long `len' = 0

    * A second pass with swapped words is only needed for ordered comparisons
    local passes = 1
    if "`words_unordered'" != "" & "`letters_unordered'" == "" local passes = 2

    forvalues i = 1/`passes' {

      ************    Begin Word Match             ************
      replace `a' = `long_raw'
      replace `b' = `short_raw'

      * Swap the words while the blanks that separate them are still present
      if `i'==2 replace `b' = word(`short_raw',2)+word(`short_raw',1)

      * Normalise both strings: blanks, then case, then excluded characters
      foreach s in `a' `b' {
        if "`blanks'" != "" replace `s' = subinstr(`s', " ", "", .)
        if "`case'" != "" replace `s' = lower(`s')
        foreach v in `exclude' {
          replace `s' = subinstr(`s', "`v'", "", .)
        }
      }

      * Normalisation can change which string is longer, so restore longest first
      replace `flip' = length(`b') > length(`a')
      replace `hold' = `a'
      replace `a' = `b' if `flip'
      replace `b' = `hold' if `flip'

      replace `gen_match'=1 if `a'==`b' & `gen_match'==0
      ************    End Word Match               ************

      ************    Begin Ordered Letters Match  ************
      if "`letters_unordered'" == "" & 0 < `free_letters' {

        * Longest normalised string in the data sets the number of positions to check
        replace `len' = max(length(`a'),length(`b'))
        summarize `len', meanonly
        local maxlen = r(max)
        if missing(`maxlen') local maxlen = 0

        * Restart the count for rows that are still unmatched.  After a second
        * pass the count of an unmatched row refers to the swapped word order.
        replace `missed_letter_count' = 0 if `gen_match'==0

        * Add 1 for every position at which the two strings differ
        forvalues p = 1/`maxlen' {
          replace `missed_letter_count' = `missed_letter_count'+1 ///
              if `gen_match'==0 & substr(`a',`p',1) != substr(`b',`p',1)
        }

        * Only rows with two non-empty strings may be promoted to a match
        replace `gen_match' = 1 if `missed_letter_count' <= `free_letters' & `gen_match'==0
      }
      ************    End Ordered Letters Match    ************
    }

    ************    Begin Unordered Letters Match  ************
    if "`letters_unordered'" != "" {

      * a and b hold the normalised strings from the single pass above
      replace `len' = max(length(`a'),length(`b'))
      summarize `len', meanonly
      local maxlen = r(max)
      if missing(`maxlen') local maxlen = 0

      replace `missed_letter_count' = 0 if `gen_match'==0

      * rest holds the letters of the shorter string that are still available
      gen `rest' = `b'
      gen long `pos' = 0
      gen `chr' = ""

      forvalues p = 1/`maxlen' {
        * Letter p of the longer string, and where it first occurs in rest
        replace `chr' = substr(`a',`p',1)
        replace `pos' = strpos(`rest',`chr')
        replace `pos' = 0 if `chr' == ""

        * Positions past the end of the longer string are not letters and never count
        replace `missed_letter_count' = `missed_letter_count'+1 ///
            if `gen_match'==0 & `chr' != "" & `pos' == 0

        * Unless letters may be reused, a matched letter is used up
        if "`reuse_letters'" == "" {
          replace `rest' = substr(`rest',1,`pos'-1)+substr(`rest',`pos'+1,.) if `pos' > 0
        }
      }

      replace `gen_match' = 1 if `missed_letter_count' <= `free_letters' & `gen_match'==0
    }
    ************    End Unordered Letters Match  ************
  }

  tab `gen_match'
  * Tabulate the count only when this call created it
  if `count_missed' tab `missed_letter_count'

end



* Worked examples for fuzzy, run on a 1000-observation dataset.
* Option names are written out in full; fuzzy also accepts their abbreviations.
set obs 1000

* Two names made of the same two words in opposite order
generate v0 = "John Smith"
generate v1 = "Smith John"

* No options: plain comparison, with the result stored in the default
* indicator variable (matched).
fuzzy v0 v1

* Word order is allowed to differ.
fuzzy v0 v1, words_unordered

generate v2 = "Smith        John"

* Blanks are dropped before comparing, so the run of spaces in v2 is ignored.
fuzzy v1 v2, blanks

* Note that asking fuzzy to ignore word order also tells it to drop blanks.

* NOTE(review): in a do-file a three-slash sequence preceded by a blank may be
* read as a line continuation even inside quotes - confirm that v3 ends up
* holding the intended text.
generate v3 = "Smi.th   ///     J.,.o,h...n ZZZZZZ"

* Characters to exclude must be separated by spaces.
fuzzy v1 v3, blanks exclude(. , / Z)

generate obsid = _n

* v4 is v0 with the observation number appended
generate v4 = v0 + string(obsid)

* Allow up to two letters to differ between the strings ...
fuzzy v0 v4, free_letters(2) blanks

* ... and then up to three.
fuzzy v0 v4, free_letters(3) blanks

* letters_unordered tells fuzzy to ignore letter order when looking for a match.
generate v5 = "Jist mhohn"
fuzzy v0 v5, free_letters(0) letters_unordered blanks

* The comparison above is case sensitive, so the s in Jist does not match
* the S in Smith.  The case option turns case sensitivity off.
fuzzy v0 v5, free_letters(0) letters_unordered blanks case

* Finally, letters can be allowed to be reused while matching.
generate v6 = "John Smith John"

fuzzy v0 v6, free_letters(1) reuse_letters letters_unordered

1 comment:

  1. Hi Francis,

    Nice code for a tricky problem, but I believe the same can be done much faster with levenshtein.ado.