SlideShare a Scribd company logo
1 of 57
Download to read offline
TEACHING YOUR
MACHINE
TO FIND
FRAUDSTERS

Ian Barber
ianb@php.net
phpir.com
twitter.com/ianbarber
http://joind.in/3429




https://github.com/ianbarber/FindingFraudsters-Talk
5%
           3%
SOME      .1%
SMALL
NUMBERS    8%
99%
ACCURACY
REALLY     REALLY
             LEGITIMATE   FRAUD


EVALUATED
                989         0
LEGITIMATE


EVALUATED
                 10         1
  FRAUD
REALLY     REALLY
             LEGITIMATE   FRAUD



      90%
EVALUATED
LEGITIMATE
          WRONG 989         0



EVALUATED
                 10         1
  FRAUD
ANOMALY DETECTION
30




         22.5
Clicks




          15




          7.5




           0
                Date
SOFTWARE
ARCHITECTURE
                           Alarm

               Detector

                          No Alarm
                Buffer


User Clicks    Landing
    Ad          Page
DETECTOR
              statistics

 Expected
  Clicks
              Threshold    Data Buffer
Sensitivity



               Alarm
average.php
/**
 * Moving-average anomaly detector over a CSV of daily click counts.
 *
 * Each row's click count (column 1) is compared with the mean of the
 * preceding seven values plus $sen standard deviations; anything above that
 * threshold raises an alarm. Rows numbered above $fraudStart are treated as
 * the period in which an alarm counts as the real detection: the scan stops
 * at the first such alarm, and every quiet row before it adds one to the
 * days-to-detect counter.
 *
 * (In the talk this function is shown over two slides separated by a chart
 * whose axes run 0-0.2 in steps of 0.05 and 1-20.)
 *
 * @param float  $sen        Sensitivity: number of standard deviations above
 *                           the window mean that triggers an alarm.
 * @param string $file       CSV file to read; column 1 holds the click count.
 * @param int    $fraudStart Row number after which alarms count as detections.
 * @return array array(false alarm count, days to detect)
 * @throws RuntimeException when the CSV file cannot be opened.
 */
function detect($sen, $file = "fraudclicks.csv", $fraudStart = 201) {
  $windowSize = 7;
  $window = array(); $i = 0;
  $alarmCount = 0; $dtd = 0;
  $detected = false;

  $fraud = fopen($file, 'r');
  if($fraud === false) {
    throw new RuntimeException("Unable to open " . $file);
  }

  while($d = fgetcsv($fraud)) {
    $i++;
    // The buffer holds one value more than the window; dropping the oldest
    // leaves exactly the $windowSize values preceding the current row.
    if(count($window) > $windowSize) {
      array_shift($window);
      $avg = array_sum($window) / $windowSize;

      // Sum of squared deviations is rebuilt from zero for every row so the
      // previous row's result cannot leak into this one.
      $sumSq = 0;
      foreach($window as $val) {
        $sumSq += pow($val - $avg, 2);
      }
      $stddev = sqrt($sumSq / $windowSize);

      if($d[1] > ($avg + ($sen * $stddev))) {
        $alarmCount++;
        if($i > $fraudStart) {
          $detected = true;
          break;
        }
      } else if($i > $fraudStart) {
        $dtd++;
      }
    }
    array_push($window, $d[1]);
  }
  fclose($fraud);

  // The final alarm is the true detection, not a false alarm; only subtract
  // it when a detection actually happened.
  return array($detected ? $alarmCount - 1 : $alarmCount, $dtd);
}
1.6 SENSITIVITY
          30
                18 False Alarms          1 Day To Detect

         22.5
Clicks




          15




          7.5




           0
                                  Date
2.7 SENSITIVITY
          30
                1 False Alarm      18 Days To Detect

         22.5
Clicks




          15




          7.5




           0
                                Date
SICKNESS
AVAILABILITY
function detect($sens) {          sickavail.php
  $i = 0; $alarms = 0; $dtd = 0;
  $window = array(); $avail = array();
  $fraud = fopen("fraudclicks.csv", 'r');
  while($dat = fgetcsv($fraud)) {
    $dow = date("w", strtotime($dat[0]));
    if( count($window) >= 7
        && isset($avail[$dow]) ) {

      $sick = 0;
      foreach($window as $day => $value) {
        $dowavg = array_sum($avail[$day]) /
                  count($avail[$day]);
        $sick += $value / $dowavg;
      }
      $sick /= count($window);
$avlblty = array_sum($avail[$dow]) /
           count($avail[$dow]);
  $est = $sick * $avlblty;

  $fac = fac($dat[1]);
  $p = exp(-$est) * pow($est,$dat[1])
       / $fac; // poisson calc

  if($p < $sens && $dat[1] > $est) {
    $alarms++;
    if($i > 201) { break; }
  } else {
    if($i > 201) { $dtd++; }
  }

} // end if
0.2




0.15




 0.1




0.05




  0
       1   2   3   4   5   6   7   8   9   10
0.011 SENSITIVITY
          30
                1 False Alarm          1 Day To Detect

         22.5
Clicks




          15




          7.5




           0
                                Date
SUPERVISED CLASSIFIERS
classification model
SOFTWARE
ARCHITECTURE
                               Fraud

            Classifier

                             Not Fraud
  User     Transaction
Purchase    Processor


           Transaction
                              Learner
            Database
EVALUATING THE CLASSIFIER

Training Data   Learner      Model




 Test Data
                            Prediction
                Classifier   Accuracy
   Model
20




15




10




5




0
     0   5   10   15   20
20




15




10




5
             ?
0
     0   5       10   15   20
20




15




10




5
             ?
0
     0   5       10   15   20
// Labelled sample transactions: each has a fraud flag plus the price,
// description and shipping country that get indexed as text.
$docs = array(
  array('fraud' => false, 'price' => 1699,  'desc' => 'toy ninja', 'ship' => 'US'),
  array('fraud' => false, 'price' => 20000, 'desc' => 'TV',        'ship' => 'US'),
  array('fraud' => false, 'price' => 2500,  'desc' => 'cds',       'ship' => 'US'),
  array('fraud' => true,  'price' => 20000, 'desc' => 'console',   'ship' => 'CN'),
  array('fraud' => true,  'price' => 5000,  'desc' => 'books',     'ship' => 'US'),
  array('fraud' => true,  'price' => 15000, 'desc' => 'ipod',      'ship' => 'CN'),
);

// Open (or create) the on-disk index and set up an English-stemming
// term generator.
$db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN);
$stem = new XapianStem("english");
$idx = new XapianTermGenerator();
$idx->set_stemmer($stem);

foreach($docs as $docKey => $record) {
    // The class label is stored as the document's data payload.
    if($record['fraud']) {
        $label = "fraud";
    } else {
        $label = "clean";
    }
    $text = "{$record['price']} {$record['desc']} {$record['ship']}";

    $xdoc = new XapianDocument();
    $xdoc->set_data($label);
    $idx->set_document($xdoc);
    $idx->index_text($text);
    // NOTE(review): the array key is passed as a second argument here;
    // confirm against the Xapian bindings that add_document() accepts it.
    $db->add_document($xdoc, $docKey);
}

// Drop the database handle (presumably so pending writes are committed
// before the script ends - confirm).
$db = null;
                               fraudknn.php
// testknn.php
// Classify one unlabelled transaction by finding the indexed documents that
// are most similar to it and printing their stored labels.
$test = array(
   'price' => 10000, 'desc' => 'TV',
   'ship' => 'CN'
);

$db   = new XapianWritableDatabase("index",
         Xapian::DB_CREATE_OR_OPEN);
$idx = new XapianTermGenerator();
$stem = new XapianStem("english");
$idx->set_stemmer($stem);

// Index the test transaction as a temporary document so it can be used as
// the relevance set for the similarity query below.
$xdoc = new XapianDocument();
$idx->set_document($xdoc);
$idx->index_text($test['price'] . ' ' .
      $test['desc'] . ' ' . $test['ship']);
$id = $db->add_document($xdoc);

// Ask for up to 10 expansion terms derived from the temporary document.
$enq = new XapianEnquire($db);
$rset = new XapianRSet();
$rset->add_document($id);
$eset = $enq->get_eset(10, $rset);
$terms = array();
$i = $eset->begin();
while ( !$i->equals($eset->end()) ) {
  $terms[] = $i->get_term(); $i->next();
}

// OR the terms together and fetch the first 4 matches.
$q = new XapianQuery(
         XapianQuery::OP_OR, $terms);
$enq->set_query($q);
$matches = $enq->get_mset(0, 4, $rset);
$i = $matches->begin();
while (!$i->equals($matches->end())) {
  // Skip the temporary document itself; it carries no label.
  if($i->get_document()->get_docid() != $id)
  {
    $class = $i->get_document()->get_data();
    var_dump($class);
  }
  $i->next();
}

// Remove the temporary document so the index only holds labelled data.
$db->delete_document($id);


$ php testknn.php
string(5) "clean"
string(5) "fraud"
string(5) "fraud"
TRANSACTION
PARAMETERS
/**
 * email.php - score how closely an email address resembles a customer name.
 *
 * Both inputs are lower-cased. The address is split into its user part
 * (with "." and "+" turned into spaces) and the first label of its domain.
 * The score is:
 *   1.0  when the whole name is more than 80% similar to the user part,
 *   0.8  when the whole name is more than 80% similar to the domain label,
 *   otherwise 1.7 * (best / 100) - 1, where "best" is the highest similarity
 *   above 50% between any single word of the name and the user part or the
 *   domain label (0 when nothing exceeds 50%, giving -1).
 *
 * @param string $name  Customer name, words separated by spaces.
 * @param string $email Email address; a missing "@" is treated as having an
 *                      empty domain.
 * @return float Score between -1 and 1.0.
 */
function compareEmailToName($name, $email) {
  $name = strtolower($name);
  $email = strtolower($email);
  $parts = explode(" ", $name);
  $pcnt = 0;

  // Guard against addresses without "@" so $dom is always defined.
  $pieces = explode("@", $email);
  $user = $pieces[0];
  $dom = isset($pieces[1]) ? $pieces[1] : "";

  $user = str_replace(
              array(".", "+"), " ", $user);
  // Keep only the first domain label: drop the first literal dot and
  // everything after it. The dot must be escaped, otherwise the pattern
  // matches (and removes) the entire domain.
  $dom = preg_replace('/\..*/', "", $dom);

  similar_text($name, $user, $pcnt);
  if($pcnt > 80) { return 1.0; }
  similar_text($name, $dom, $pcnt);
  if($pcnt > 80) { return 0.8; }

  // No whole-name match: look for the best single-word match instead.
  // explode() always yields at least one element, so this loop always runs.
  $highest = 0;
  foreach($parts as $part) {
    similar_text($user, $part, $pcnt);
    if($pcnt > 50 && $pcnt > $highest) {
      $highest = $pcnt;
    }
    similar_text($dom, $part, $pcnt);
    if($pcnt > 50 && $pcnt > $highest) {
      $highest = $pcnt;
    }
  }

  return (1.7 * ($highest/100)) - 1;
}
// Example feature record for a single transaction. Key order matters when
// the record is flattened to a row: 'fraud' is the last (15th) field and
// presumably holds the known label - confirm against the data file.
$data = [
  'purchase_value'             => 20993,
  'geo_country'                => 'DE',
  'previous_orders'            => 1,
  'time'                       => 6,
  'timegap'                    => 146632,
  'product_category'           => 'small_item',
  'delivery_matches_card'      => 0,
  'geo_ip_matches_card'        => 1,
  'difference_from_last_trans' => 8755,
  'free_shipping'              => 0,
  'email_like_name'            => 0,
  'free_email_provider'        => 0,
  'disposable_email_provider'  => 0,
  'quantity'                   => 2,
  'fraud'                      => 0,
];
SUPPORT
VECTOR MACHINES
20




15




10




5




0
     0   5   10   15   20
20




15




10




5




0
     0   5   10   15   20
20




15




10




5




0
     0   5   10   15   20
20




15




10




5




0
     0   5   10   15   20
20




15




10




5




0
     0   5   10   15   20
$ apt-get install libsvm-dev
$ apt-get install libsvm-tools

$ yum install libsvm-devel

$ pecl install svm-beta
$ echo extension=svm.so > /etc/php.d/svm.ini
$ php -r '$s = new svm(); $m = $s->train
(array(array(-1, -1), array(1, 1))); echo
$m->predict(array(0, -1));'
-1
// learn.php
// Turn every row of paydata.csv into a sparse feature vector for the SVM:
// element 0 is the class label (-1 when column 14 equals 1, otherwise 1) and
// the numbered keys are rescaled or +/-1 encoded feature values.
$fh = fopen('paydata.csv', 'r');
if($fh === false) {
  throw new RuntimeException('Unable to open paydata.csv');
}
$output = array();

while($data = fgetcsv($fh)) {
  // fgetcsv() reports a blank line as array(null); there is nothing to learn
  // from it and indexing into it would only raise warnings.
  if($data === array(null)) { continue; }

  $output[] = array(
     $data[14] == 1 ? -1 : 1,
     1 => ($data[0]/20000.00) - 1.0, // price
     2 => $data[1] == 'CN' ? 1.0:-1.0,
     3 => $data[1] == 'US' ? 1.0:-1.0,
     4 => $data[5] == 'digital' ? 1.0:-1.0,
     5 => $data[7] == 1 ? 1.0:-1.0, //geo
     6 => $data[6] == 1 ? 1.0:-1.0, // deliv
     12 => $data[9] == 1 ? 1.0:-1.0, // ship
     13 => ($data[13] / 1.5) - 1.0, // qty
  );
}
fclose($fh);
// Train on the prepared vectors (the second argument to train() is a map
// keyed by class label - presumably per-class weights, confirm against the
// svm extension docs), save the model, then score it against the same
// vectors it was trained on.
$svm = new svm();
$model = $svm->train(
  $output,
  array(-1 => 0.65, 1 => 0.5)
);
$model->save('learn.model');

// Confusion-matrix counters. Element 0 of each vector is its true label;
// a value above zero is the "positive" class.
$tp = 0;
$tn = 0;
$fp = 0;
$fn = 0;
foreach($output as $vector) {
  $predictedPositive = $model->predict($vector) > 0;
  $actualPositive = $vector[0] > 0;

  if($actualPositive && $predictedPositive) {
    $tp++;
  } elseif($actualPositive) {
    $fn++;
  } elseif($predictedPositive) {
    $fp++;
  } else {
    $tn++;
  }
}
// ...snip.. loading test data from
// paytest.csv

// Reload the model saved by the training script and score it against the
// vectors in $output.
$model = new SVMModel('learn.model');

// Confusion-matrix counters. Element 0 of each vector is its true label;
// a value above zero is the "positive" class.
$tp = 0;
$tn = 0;
$fp = 0;
$fn = 0;
foreach($output as $vector) {
  $predictedPositive = $model->predict($vector) > 0;
  $actualPositive = $vector[0] > 0;

  if($actualPositive && $predictedPositive) {
    $tp++;
  } elseif($actualPositive) {
    $fn++;
  } elseif($predictedPositive) {
    $fp++;
  } else {
    $tn++;
  }
}
                                   test.php
// Print the four confusion-matrix counters, then the overall accuracy
// (correct predictions divided by all predictions).
var_dump("True Positive {$tp}");
var_dump("True Negative {$tn}");
var_dump("False Positive {$fp}");
var_dump("False Negative {$fn}");

$correct = $tp + $tn;
$total = $tp + $tn + $fp + $fn;
var_dump("Accuracy " . ($correct / $total));
$ php learn.php
string(18) "True Positive 8316"
string(18) "True Negative 1682"
string(16) "False Positive 2"
string(16) "False Negative 0"
string(15) "Accuracy 0.9998"

$ php test.php
string(17) "True Positive 844"
string(17) "True Negative 155"
string(16) "False Positive 0"
string(16) "False Negative 1"
string(14) "Accuracy 0.999"
training data


  Test         Verify       Update



Automated     Manual        Manual
Time Series           Class Based



   Sensitivity             Model



 False    Days To    False        False
Alarms    Detect    Positives   Negatives
(shogun)
TEACHING YOUR
MACHINE
TO FIND
FRAUDSTERS

http://joind.in/3429

Ian Barber
ianb@php.net
Title Slide - CSI
http://www.flickr.com/photos/39matt/5241862082
Sickness Availability - Chicago Fire Department
http://www.flickr.com/photos/mike_miley/3929146730/
Model Buildings - Ah Ain’t Long For This Whorl
http://www.flickr.com/photos/chadmiller/98014022/
Repeat Customer - McDonald’s Loyalty Card
http://www.flickr.com/photos/fsse-info/3658873057/
Shipping - FedEx Truck
http://www.flickr.com/photos/moto_club4ag/4852235145/
Velocity - Chevrolet Chevelle Dragster
http://www.flickr.com/photos/jns001/2958999006/
GeoIP - Earth Asia Terminator View
http://www.flickr.com/photos/flyingsinger/86898564/
Multiple Items - Boxes
http://www.flickr.com/photos/skrewtape/851672959/

More Related Content

What's hot

Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...
Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...
Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...Mail.ru Group
 
Introdução ao Perl 6
Introdução ao Perl 6Introdução ao Perl 6
Introdução ao Perl 6garux
 
News of the Symfony2 World
News of the Symfony2 WorldNews of the Symfony2 World
News of the Symfony2 WorldFabien Potencier
 
Advanced modulinos
Advanced modulinosAdvanced modulinos
Advanced modulinosbrian d foy
 
Xlab #1: Advantages of functional programming in Java 8
Xlab #1: Advantages of functional programming in Java 8Xlab #1: Advantages of functional programming in Java 8
Xlab #1: Advantages of functional programming in Java 8XSolve
 
The Magic Of Tie
The Magic Of TieThe Magic Of Tie
The Magic Of Tiebrian d foy
 
Créer une base NoSQL en 1 heure
Créer une base NoSQL en 1 heureCréer une base NoSQL en 1 heure
Créer une base NoSQL en 1 heureAmaury Bouchard
 
Advanced modulinos trial
Advanced modulinos trialAdvanced modulinos trial
Advanced modulinos trialbrian d foy
 
20 modules i haven't yet talked about
20 modules i haven't yet talked about20 modules i haven't yet talked about
20 modules i haven't yet talked aboutTatsuhiko Miyagawa
 
Melhorando sua API com DSLs
Melhorando sua API com DSLsMelhorando sua API com DSLs
Melhorando sua API com DSLsAugusto Pascutti
 
Introduction to CloudForecast / YAPC::Asia 2010 Tokyo
Introduction to CloudForecast / YAPC::Asia 2010 TokyoIntroduction to CloudForecast / YAPC::Asia 2010 Tokyo
Introduction to CloudForecast / YAPC::Asia 2010 TokyoMasahiro Nagano
 
PHP Language Trivia
PHP Language TriviaPHP Language Trivia
PHP Language TriviaNikita Popov
 

What's hot (18)

zinno
zinnozinno
zinno
 
Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...
Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...
Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...
 
Introdução ao Perl 6
Introdução ao Perl 6Introdução ao Perl 6
Introdução ao Perl 6
 
News of the Symfony2 World
News of the Symfony2 WorldNews of the Symfony2 World
News of the Symfony2 World
 
C99
C99C99
C99
 
Php 101: PDO
Php 101: PDOPhp 101: PDO
Php 101: PDO
 
Advanced modulinos
Advanced modulinosAdvanced modulinos
Advanced modulinos
 
Xlab #1: Advantages of functional programming in Java 8
Xlab #1: Advantages of functional programming in Java 8Xlab #1: Advantages of functional programming in Java 8
Xlab #1: Advantages of functional programming in Java 8
 
The Magic Of Tie
The Magic Of TieThe Magic Of Tie
The Magic Of Tie
 
C99[2]
C99[2]C99[2]
C99[2]
 
Créer une base NoSQL en 1 heure
Créer une base NoSQL en 1 heureCréer une base NoSQL en 1 heure
Créer une base NoSQL en 1 heure
 
Advanced modulinos trial
Advanced modulinos trialAdvanced modulinos trial
Advanced modulinos trial
 
Cod
CodCod
Cod
 
20 modules i haven't yet talked about
20 modules i haven't yet talked about20 modules i haven't yet talked about
20 modules i haven't yet talked about
 
Melhorando sua API com DSLs
Melhorando sua API com DSLsMelhorando sua API com DSLs
Melhorando sua API com DSLs
 
Perl 6 by example
Perl 6 by examplePerl 6 by example
Perl 6 by example
 
Introduction to CloudForecast / YAPC::Asia 2010 Tokyo
Introduction to CloudForecast / YAPC::Asia 2010 TokyoIntroduction to CloudForecast / YAPC::Asia 2010 Tokyo
Introduction to CloudForecast / YAPC::Asia 2010 Tokyo
 
PHP Language Trivia
PHP Language TriviaPHP Language Trivia
PHP Language Trivia
 

Viewers also liked

Deloittes 2009 Technology Fast 500™ Ranking
Deloittes 2009 Technology Fast 500™  RankingDeloittes 2009 Technology Fast 500™  Ranking
Deloittes 2009 Technology Fast 500™ Rankinglisaswiftney
 
Deloitte-2014-Technology-Fast500
Deloitte-2014-Technology-Fast500Deloitte-2014-Technology-Fast500
Deloitte-2014-Technology-Fast500Seth Greenberg
 
dollar general annual reports 2002
dollar general annual reports 2002dollar general annual reports 2002
dollar general annual reports 2002finance41
 
Deployment Tactics
Deployment TacticsDeployment Tactics
Deployment TacticsIan Barber
 
Arc Sight Info Documents 10 21 2009
Arc Sight Info Documents 10 21 2009Arc Sight Info Documents 10 21 2009
Arc Sight Info Documents 10 21 2009mattdriscoll
 
The Pixel Lab 2015 | Don't lose heart - Sean Coleman
The Pixel Lab 2015 | Don't lose heart - Sean Coleman The Pixel Lab 2015 | Don't lose heart - Sean Coleman
The Pixel Lab 2015 | Don't lose heart - Sean Coleman power to the pixel
 
Arc Sight Info Documents 12 3 2009
Arc Sight Info Documents 12 3 2009Arc Sight Info Documents 12 3 2009
Arc Sight Info Documents 12 3 2009mattdriscoll
 
Document Classification In PHP - Slight Return
Document Classification In PHP - Slight ReturnDocument Classification In PHP - Slight Return
Document Classification In PHP - Slight ReturnIan Barber
 
ZeroMQ Is The Answer: PHP Tek 11 Version
ZeroMQ Is The Answer: PHP Tek 11 VersionZeroMQ Is The Answer: PHP Tek 11 Version
ZeroMQ Is The Answer: PHP Tek 11 VersionIan Barber
 
Social media & dirigeants du Cac 40 : que disent les conversations ?
Social media & dirigeants du Cac 40 : que disent les conversations ?Social media & dirigeants du Cac 40 : que disent les conversations ?
Social media & dirigeants du Cac 40 : que disent les conversations ?Linkfluence
 
Israel pide un rey
Israel pide un reyIsrael pide un rey
Israel pide un reyCoke Neto
 
Technology-Fast-500-Winners-Brochure.PDF
Technology-Fast-500-Winners-Brochure.PDFTechnology-Fast-500-Winners-Brochure.PDF
Technology-Fast-500-Winners-Brochure.PDFJustin Campbell
 

Viewers also liked (16)

Deloittes 2009 Technology Fast 500™ Ranking
Deloittes 2009 Technology Fast 500™  RankingDeloittes 2009 Technology Fast 500™  Ranking
Deloittes 2009 Technology Fast 500™ Ranking
 
Canada Deber 2pdf
Canada Deber 2pdfCanada Deber 2pdf
Canada Deber 2pdf
 
Deloitte-2014-Technology-Fast500
Deloitte-2014-Technology-Fast500Deloitte-2014-Technology-Fast500
Deloitte-2014-Technology-Fast500
 
dollar general annual reports 2002
dollar general annual reports 2002dollar general annual reports 2002
dollar general annual reports 2002
 
Deployment Tactics
Deployment TacticsDeployment Tactics
Deployment Tactics
 
20140528 valeant story draft deckv85
20140528 valeant story draft deckv8520140528 valeant story draft deckv85
20140528 valeant story draft deckv85
 
Arc Sight Info Documents 10 21 2009
Arc Sight Info Documents 10 21 2009Arc Sight Info Documents 10 21 2009
Arc Sight Info Documents 10 21 2009
 
The Pixel Lab 2015 | Don't lose heart - Sean Coleman
The Pixel Lab 2015 | Don't lose heart - Sean Coleman The Pixel Lab 2015 | Don't lose heart - Sean Coleman
The Pixel Lab 2015 | Don't lose heart - Sean Coleman
 
Arc Sight Info Documents 12 3 2009
Arc Sight Info Documents 12 3 2009Arc Sight Info Documents 12 3 2009
Arc Sight Info Documents 12 3 2009
 
Document Classification In PHP - Slight Return
Document Classification In PHP - Slight ReturnDocument Classification In PHP - Slight Return
Document Classification In PHP - Slight Return
 
ZeroMQ Is The Answer: PHP Tek 11 Version
ZeroMQ Is The Answer: PHP Tek 11 VersionZeroMQ Is The Answer: PHP Tek 11 Version
ZeroMQ Is The Answer: PHP Tek 11 Version
 
Social media & dirigeants du Cac 40 : que disent les conversations ?
Social media & dirigeants du Cac 40 : que disent les conversations ?Social media & dirigeants du Cac 40 : que disent les conversations ?
Social media & dirigeants du Cac 40 : que disent les conversations ?
 
Eca´s probabilidad y estadística Agosto 2012-Enero 2013
Eca´s probabilidad y estadística Agosto 2012-Enero 2013Eca´s probabilidad y estadística Agosto 2012-Enero 2013
Eca´s probabilidad y estadística Agosto 2012-Enero 2013
 
4 de febrero de 1992 pdf
4 de febrero de 1992 pdf4 de febrero de 1992 pdf
4 de febrero de 1992 pdf
 
Israel pide un rey
Israel pide un reyIsrael pide un rey
Israel pide un rey
 
Technology-Fast-500-Winners-Brochure.PDF
Technology-Fast-500-Winners-Brochure.PDFTechnology-Fast-500-Winners-Brochure.PDF
Technology-Fast-500-Winners-Brochure.PDF
 

Similar to Teaching Your Machine to Detect Fraud With Supervised Learning Techniques

Javascript & jQuery: A pragmatic introduction
Javascript & jQuery: A pragmatic introductionJavascript & jQuery: A pragmatic introduction
Javascript & jQuery: A pragmatic introductionIban Martinez
 
Crazy things done on PHP
Crazy things done on PHPCrazy things done on PHP
Crazy things done on PHPTaras Kalapun
 
Rails-like JavaScript Using CoffeeScript, Backbone.js and Jasmine
Rails-like JavaScript Using CoffeeScript, Backbone.js and JasmineRails-like JavaScript Using CoffeeScript, Backbone.js and Jasmine
Rails-like JavaScript Using CoffeeScript, Backbone.js and JasmineRaimonds Simanovskis
 
Your code sucks, let's fix it - DPC UnCon
Your code sucks, let's fix it - DPC UnConYour code sucks, let's fix it - DPC UnCon
Your code sucks, let's fix it - DPC UnConRafael Dohms
 
TLS305 Using DynamoDB with the AWS SDK for PHP - AWS re: Invent 2012
TLS305 Using DynamoDB with the AWS SDK for PHP - AWS re: Invent 2012TLS305 Using DynamoDB with the AWS SDK for PHP - AWS re: Invent 2012
TLS305 Using DynamoDB with the AWS SDK for PHP - AWS re: Invent 2012Amazon Web Services
 
Mocking Dependencies in PHPUnit
Mocking Dependencies in PHPUnitMocking Dependencies in PHPUnit
Mocking Dependencies in PHPUnitmfrost503
 
Unit testing with zend framework tek11
Unit testing with zend framework tek11Unit testing with zend framework tek11
Unit testing with zend framework tek11Michelangelo van Dam
 
Mocking Dependencies in PHPUnit
Mocking Dependencies in PHPUnitMocking Dependencies in PHPUnit
Mocking Dependencies in PHPUnitmfrost503
 
Unit testing with zend framework PHPBenelux
Unit testing with zend framework PHPBeneluxUnit testing with zend framework PHPBenelux
Unit testing with zend framework PHPBeneluxMichelangelo van Dam
 
Document Classification In PHP
Document Classification In PHPDocument Classification In PHP
Document Classification In PHPIan Barber
 
Gta v savegame
Gta v savegameGta v savegame
Gta v savegamehozayfa999
 
WordPress Realtime - WordCamp São Paulo 2015
WordPress Realtime - WordCamp São Paulo 2015WordPress Realtime - WordCamp São Paulo 2015
WordPress Realtime - WordCamp São Paulo 2015Fernando Daciuk
 
Game Development with SDL and Perl
Game Development with SDL and PerlGame Development with SDL and Perl
Game Development with SDL and Perlgarux
 
R57shell
R57shellR57shell
R57shellady36
 

Similar to Teaching Your Machine to Detect Fraud With Supervised Learning Techniques (20)

Javascript & jQuery: A pragmatic introduction
Javascript & jQuery: A pragmatic introductionJavascript & jQuery: A pragmatic introduction
Javascript & jQuery: A pragmatic introduction
 
Crazy things done on PHP
Crazy things done on PHPCrazy things done on PHP
Crazy things done on PHP
 
Coding website
Coding websiteCoding website
Coding website
 
Rails-like JavaScript Using CoffeeScript, Backbone.js and Jasmine
Rails-like JavaScript Using CoffeeScript, Backbone.js and JasmineRails-like JavaScript Using CoffeeScript, Backbone.js and Jasmine
Rails-like JavaScript Using CoffeeScript, Backbone.js and Jasmine
 
Your code sucks, let's fix it - DPC UnCon
Your code sucks, let's fix it - DPC UnConYour code sucks, let's fix it - DPC UnCon
Your code sucks, let's fix it - DPC UnCon
 
TLS305 Using DynamoDB with the AWS SDK for PHP - AWS re: Invent 2012
TLS305 Using DynamoDB with the AWS SDK for PHP - AWS re: Invent 2012TLS305 Using DynamoDB with the AWS SDK for PHP - AWS re: Invent 2012
TLS305 Using DynamoDB with the AWS SDK for PHP - AWS re: Invent 2012
 
Mocking Dependencies in PHPUnit
Mocking Dependencies in PHPUnitMocking Dependencies in PHPUnit
Mocking Dependencies in PHPUnit
 
My Development Story
My Development StoryMy Development Story
My Development Story
 
Unit testing with zend framework tek11
Unit testing with zend framework tek11Unit testing with zend framework tek11
Unit testing with zend framework tek11
 
Mocking Dependencies in PHPUnit
Mocking Dependencies in PHPUnitMocking Dependencies in PHPUnit
Mocking Dependencies in PHPUnit
 
Mocking Demystified
Mocking DemystifiedMocking Demystified
Mocking Demystified
 
Ns2programs
Ns2programsNs2programs
Ns2programs
 
Unit testing with zend framework PHPBenelux
Unit testing with zend framework PHPBeneluxUnit testing with zend framework PHPBenelux
Unit testing with zend framework PHPBenelux
 
Document Classification In PHP
Document Classification In PHPDocument Classification In PHP
Document Classification In PHP
 
Gta v savegame
Gta v savegameGta v savegame
Gta v savegame
 
WordPress Realtime - WordCamp São Paulo 2015
WordPress Realtime - WordCamp São Paulo 2015WordPress Realtime - WordCamp São Paulo 2015
WordPress Realtime - WordCamp São Paulo 2015
 
Database api
Database apiDatabase api
Database api
 
ddd+scala
ddd+scaladdd+scala
ddd+scala
 
Game Development with SDL and Perl
Game Development with SDL and PerlGame Development with SDL and Perl
Game Development with SDL and Perl
 
R57shell
R57shellR57shell
R57shell
 

Recently uploaded

DevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache MavenDevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache MavenHervé Boutemy
 
Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Scott Keck-Warren
 
Install Stable Diffusion in windows machine
Install Stable Diffusion in windows machineInstall Stable Diffusion in windows machine
Install Stable Diffusion in windows machinePadma Pradeep
 
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)Mark Simos
 
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmaticsKotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmaticscarlostorres15106
 
Commit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easyCommit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easyAlfredo García Lavilla
 
Training state-of-the-art general text embedding
Training state-of-the-art general text embeddingTraining state-of-the-art general text embedding
Training state-of-the-art general text embeddingZilliz
 
Search Engine Optimization SEO PDF for 2024.pdf
Search Engine Optimization SEO PDF for 2024.pdfSearch Engine Optimization SEO PDF for 2024.pdf
Search Engine Optimization SEO PDF for 2024.pdfRankYa
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek SchlawackFwdays
 
DevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platformsDevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platformsSergiu Bodiu
 
Anypoint Exchange: It’s Not Just a Repo!
Anypoint Exchange: It’s Not Just a Repo!Anypoint Exchange: It’s Not Just a Repo!
Anypoint Exchange: It’s Not Just a Repo!Manik S Magar
 
Connect Wave/ connectwave Pitch Deck Presentation
Connect Wave/ connectwave Pitch Deck PresentationConnect Wave/ connectwave Pitch Deck Presentation
Connect Wave/ connectwave Pitch Deck PresentationSlibray Presentation
 
Unraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdfUnraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdfAlex Barbosa Coqueiro
 
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr LapshynFwdays
 
SAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxSAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxNavinnSomaal
 
Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?Mattias Andersson
 
My Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 PresentationMy Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 PresentationRidwan Fadjar
 
Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!Commit University
 
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...Patryk Bandurski
 

Recently uploaded (20)

DevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache MavenDevoxxFR 2024 Reproducible Builds with Apache Maven
DevoxxFR 2024 Reproducible Builds with Apache Maven
 
Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024Advanced Test Driven-Development @ php[tek] 2024
Advanced Test Driven-Development @ php[tek] 2024
 
Install Stable Diffusion in windows machine
Install Stable Diffusion in windows machineInstall Stable Diffusion in windows machine
Install Stable Diffusion in windows machine
 
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
Tampa BSides - Chef's Tour of Microsoft Security Adoption Framework (SAF)
 
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmaticsKotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
Kotlin Multiplatform & Compose Multiplatform - Starter kit for pragmatics
 
Commit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easyCommit 2024 - Secret Management made easy
Commit 2024 - Secret Management made easy
 
Training state-of-the-art general text embedding
Training state-of-the-art general text embeddingTraining state-of-the-art general text embedding
Training state-of-the-art general text embedding
 
Search Engine Optimization SEO PDF for 2024.pdf
Search Engine Optimization SEO PDF for 2024.pdfSearch Engine Optimization SEO PDF for 2024.pdf
Search Engine Optimization SEO PDF for 2024.pdf
 
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
"Subclassing and Composition – A Pythonic Tour of Trade-Offs", Hynek Schlawack
 
DevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platformsDevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platforms
 
Anypoint Exchange: It’s Not Just a Repo!
Anypoint Exchange: It’s Not Just a Repo!Anypoint Exchange: It’s Not Just a Repo!
Anypoint Exchange: It’s Not Just a Repo!
 
Connect Wave/ connectwave Pitch Deck Presentation
Connect Wave/ connectwave Pitch Deck PresentationConnect Wave/ connectwave Pitch Deck Presentation
Connect Wave/ connectwave Pitch Deck Presentation
 
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptxE-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
E-Vehicle_Hacking_by_Parul Sharma_null_owasp.pptx
 
Unraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdfUnraveling Multimodality with Large Language Models.pdf
Unraveling Multimodality with Large Language Models.pdf
 
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
"Federated learning: out of reach no matter how close",Oleksandr Lapshyn
 
SAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptxSAP Build Work Zone - Overview L2-L3.pptx
SAP Build Work Zone - Overview L2-L3.pptx
 
Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?Are Multi-Cloud and Serverless Good or Bad?
Are Multi-Cloud and Serverless Good or Bad?
 
My Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 PresentationMy Hashitalk Indonesia April 2024 Presentation
My Hashitalk Indonesia April 2024 Presentation
 
Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!Nell’iperspazio con Rocket: il Framework Web di Rust!
Nell’iperspazio con Rocket: il Framework Web di Rust!
 
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
Integration and Automation in Practice: CI/CD in Mule Integration and Automat...
 

Teaching Your Machine to Detect Fraud With Supervised Learning Techniques

  • 1. TEACHING YOUR MACHINE TO FIND FRAUDSTERS Ian Barber ianb@php.net phpir.com twitter.com/ianbarber
  • 3. 5% 3% SOME .1% SMALL NUMBERS 8%
  • 5. REALLY REALLY LEGITIMATE FRAUD EVALUATED 989 0 LEGITIMATE EVALUATED 10 1 FRAUD
  • 6. REALLY REALLY LEGITIMATE FRAUD 90% EVALUATED LEGITIMATE WRONG 989 0 EVALUATED 10 1 FRAUD
  • 8. 30 22.5 Clicks 15 7.5 0 Date
  • 9. SOFTWARE ARCHITECTURE Alarm Detector No Alarm Buffer User Clicks Landing Ad Page
  • 10. DETECTOR statistics Expected Clicks Threshold Data Buffer Sensitivity Alarm
  • 11. average.php function detect($sen) { $window = array(); $i = 0; $alarmCount = 0; $dtd = 0; $avg = $stddev = 0; $fraud = fopen("fraudclicks.csv", 'r'); while($d = fgetcsv($fraud)) { $i++; if(count($window) > 7) { array_shift($window); $avg = array_sum($window) / 7; foreach($window as $val) { $stddev += pow($val - $average, 2); } $stddev = sqrt($stddev/7);
  • 12. 0.2 0.15 0.1 0.05 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
  • 13. if($d[1] > ($avg + ($sen * $stddev))){ $alarmCount++; if($i > 201) { break; } } else { if($i > 201) { $dtd++; } } } array_push($window, $d[1]); } return array($alarmCount-1, $dtd); }
  • 14. 1.6 SENSITIVITY 30 18 False Alarms 1 Day To Detect 22.5 Clicks 15 7.5 0 Date
  • 15. 2.7 SENSITIVITY 30 1 False Alarm 18 Days To Detect 22.5 Clicks 15 7.5 0 Date
  • 17. function detect($sens) { sickavail.php $i = 0; $alarms = 0; $dtd = 0; $window = array(); $avail = array(); $fraud = fopen("fraudclicks.csv", 'r'); while($dat = fgetcsv($fraud)) { $dow = date("w", strtotime($dat[0])); if( count($window) >= 7 && isset($avail[$dow]) ) { $sick = 0; foreach($window as $day => $value) { $dowavg = array_sum($avail[$day]) / count($avail[$day]); $sick += $value / $dowavg; } $sick /= count($window);
  • 18. $avlblty = array_sum($avail[$dow]) / count($avail[$dow]); $est = $sick * $avlblty; $fac = fac($dat[1]); $p = exp(-$est) * pow($est,$dat[1]) / $fac; // poisson calc if($p < $sens && $dat[1] > $est) { $alarms++; if($i > 201) { break; } } else { if($i > 201) { $dtd++; } } } // end if
  • 19. 0.2 0.15 0.1 0.05 0 1 2 3 4 5 6 7 8 9 10
  • 20. 0.011 SENSITIVITY 30 1 False Alarm 1 Day To Detect 22.5 Clicks 15 7.5 0 Date
  • 22. classification model SOFTWARE ARCHITECTURE Fraud Classifier Not Fraud User Transaction Purchase Processor Transaction Learner Database
  • 23. EVALUATING THE CLASSIFIER Training Data Learner Model Test Data Prediction Classifier Accuracy Model
  • 24. 20 15 10 5 0 0 5 10 15 20
  • 25. 20 15 10 5 ? 0 0 5 10 15 20
  • 26. 20 15 10 5 ? 0 0 5 10 15 20
  • 27. $docs = array( array('fraud' => false, 'price' => 1699, 'desc'=>'toy ninja', 'ship' => 'US'), array('fraud' => false, 'price' => 20000, 'desc' => 'TV','ship' => 'US'), array('fraud' => false, 'price' => 2500, 'desc' => 'cds', 'ship' => 'US'), array('fraud' => true, 'price' => 20000, 'desc' => 'console', 'ship' => 'CN'), array('fraud' => true, 'price' => 5000, 'desc' => 'books', 'ship' => 'US'), array('fraud' => true, 'price' => 15000, 'desc' => 'ipod', 'ship' => 'CN'), );
  • 28. $db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN); $idx = new XapianTermGenerator(); $stem = new XapianStem("english"); $idx->set_stemmer($stem); foreach($docs as $key => $doc) { $xdoc = new XapianDocument(); $xdoc->set_data($doc['fraud'] ? "fraud" : "clean"); $idx->set_document($xdoc); $idx->index_text($doc['price'] . ' ' . $doc['desc'] . ' ' . $doc['ship']); $db->add_document($xdoc, $key); } $db = null; fraudknn.php
  • 29. $test = array( testknn.php 'price' => 10000, 'desc' => 'TV', 'ship' => 'CN' ); $db = new XapianWritableDatabase("index", Xapian::DB_CREATE_OR_OPEN); $idx = new XapianTermGenerator(); $stem = new XapianStem("english"); $idx->set_stemmer($stem); $xdoc = new XapianDocument(); $idx->set_document($xdoc); $idx->index_text($test['price'] . ' ' . $test['desc'] . ' ' . $test['ship']); $id = $db->add_document($xdoc);
  • 30. $enq = new XapianEnquire($db); $rset = new XapianRSet(); $rset->add_document($id); $eset = $enq->get_eset(10, $rset); $terms = array(); $i = $eset->begin(); while ( !$i->equals($eset->end()) ) { $terms[] = $i->get_term(); $i->next(); } $q = new XapianQuery( XapianQuery::OP_OR, $terms); $enq->set_query($q); $matches = $enq->get_mset(0, 4, $rset);
  • 31. $i = $matches->begin(); while (!$i->equals($matches->end())) { if($i->get_document()->get_docid() != $id) { $class = $i->get_document()->get_data(); var_dump($class); } $i->next(); } $db->delete_document($id); $ php testknn.php string(5) "clean" string(5) "fraud" string(5) "fraud"
  • 33.
  • 34.
  • 35. function compareEmailToName($name, $email) { $name = strtolower($name); $email = strtolower($email); $parts = explode(" ", $name); $pcnt = 0; list($user, $dom) = explode("@", $email); $user = str_replace( array(".", "+"), " ", $user); $dom = preg_replace("/..*/", "", $dom); similar_text($name, $user, $pcnt); if($pcnt > 80) { return 1.0; } similar_text($name, $dom, $pcnt); if($pcnt > 80) { return 0.8; } email.php
  • 36. if(count($parts)) { $highest = 0; foreach($parts as $part) { similar_text($user, $part, $pcnt); if($pcnt > 50 && $pcnt > $highest) { $highest = $percent; } similar_text($dom, $part, $pcnt); if($pcnt > 50 && $pcnt > $highest) { $highest = $percent; } } return (1.7 * ($highest/100)) - 1; } return -1; }
  • 37.
  • 38.
  • 39.
  • 40. $data = array( 'purchase_value' => 20993, 'geo_country' => 'DE', 'previous_orders' => 1, 'time' => 6, 'timegap' => 146632, 'product_category' => 'small_item', 'delivery_matches_card' => 0, 'geo_ip_matches_card' => 1, 'difference_from_last_trans' => 8755, 'free_shipping' => 0, 'email_like_name' => 0, 'free_email_provider' => 0, 'disposable_email_provider' => 0, 'quantity' => 2, 'fraud' => 0);
  • 42. 20 15 10 5 0 0 5 10 15 20
  • 43. 20 15 10 5 0 0 5 10 15 20
  • 44. 20 15 10 5 0 0 5 10 15 20
  • 45. 20 15 10 5 0 0 5 10 15 20
  • 46. 20 15 10 5 0 0 5 10 15 20
  • 47. $ apt-get install libsvm-dev $ apt-get install libsvm-tools $ yum install libsvm-devel $ pecl install svm-beta $ echo extension=svm.so > /etc/php.d/svm.ini $ php -r '$s = new svm(); $m = $s->train (array(array(-1, -1), array(1, 1))); echo $m->predict(array(0, -1));' -1
  • 48. $fh = fopen('paydata.csv', 'r'); $output = array(); while($data = fgetcsv($fh)) { $output[] = array( $data[14] == 1 ? -1 : 1, 1 => ($data[0]/20000.00) - 1.0, // price 2 => $data[1] == 'CN' ? 1.0:-1.0, 3 => $data[1] == 'US' ? 1.0:-1.0, 4 => $data[5] == 'digital' ? 1.0:-1.0, 5 => $data[7] == 1 ? 1.0:-1.0, //geo 6 => $data[6] == 1 ? 1.0:-1.0, // deliv 12 => $data[9] == 1 ? 1.0:-1.0, // ship 13 => ($data[13] / 1.5) - 1.0, // qty ); } learn.php
  • 49. $svm = new svm(); $model = $svm->train($output, array(-1 => 0.65, 1 => 0.5)); $model->save('learn.model'); $fp = $tp = $fn = $tn = 0; foreach($output as $test) { $res = $model->predict($test); if($test[0] > 0) { if($res > 0) { $tp++; } else { $fn++; } } else { if($res > 0) { $fp++; } else { $tn++; } } }
  • 50. // ...snip.. loading test data from // paytest.csv $model = new SVMModel('learn.model'); $fp = $tp = $fn = $tn = 0; foreach($output as $test) { $res = $model->predict($test); if($test[0] > 0) { if($res > 0) { $tp++; } else { $fn++; } } else { if($res > 0) { $fp++; } else { $tn++; } } } test.php
  • 51. var_dump("True Positive " . $tp); var_dump("True Negative " . $tn); var_dump("False Positive " . $fp); var_dump("False Negative " . $fn); var_dump("Accuracy " . (($tp+$tn)/($tp+$tn+$fp+$fn)));
  • 52. $ php learn.php string(18) "True Positive 8316" string(18) "True Negative 1682" string(16) "False Positive 2" string(16) "False Negative 0" string(15) "Accuracy 0.9998" $ php test.php string(17) "True Positive 844" string(17) "True Negative 155" string(16) "False Positive 0" string(16) "False Negative 1" string(14) "Accuracy 0.999"
  • 53. training data Test Verify Update Automated Manual Manual
  • 54. Time Series Class Based Sensitivity Model False Days To False False Alarms Detect Positives Negatives
  • 57. Title Slide - CSI http://www.flickr.com/photos/39matt/5241862082 Sickness Availability - Chicago Fire Department http://www.flickr.com/photos/mike_miley/3929146730/ Model Buildings - Ah Ain’t Long For This Whorl http://www.flickr.com/photos/chadmiller/98014022/ Repeat Customer - McDonald’s Loyalty Card http://www.flickr.com/photos/fsse-info/3658873057/ Shipping - FedEx Truck http://www.flickr.com/photos/moto_club4ag/4852235145/ Velocity - Chevrolet Chevelle Dragster http://www.flickr.com/photos/jns001/2958999006/ GeoIP - Earth Asia Terminator View http://www.flickr.com/photos/flyingsinger/86898564/ Multiple Items - Boxes http://www.flickr.com/photos/skrewtape/851672959/