Skip to content

Benford's law example #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
*/node_modules/
*/npm-debug.log
config.js
16 changes: 16 additions & 0 deletions benford/config.sample.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Sample configuration. crawler.js and server.js both load `config.js` from this
// directory, so copy this file to `config.js` and fill in the Twitter credentials.
module.exports = {
// Connection settings handed to r.connect(); `db` is the database holding the `benford` table
rethinkdb: {
host: "localhost",
port: 28015,
db: "examples"
},
// Port the web server (server.js) listens on
http: {
port: 3000
},
// Twitter API credentials handed to the Twit client in crawler.js (empty placeholders here)
twitter: {
consumer_key: '',
consumer_secret: '',
access_token: '',
access_token_secret: ''
}
}
75 changes: 75 additions & 0 deletions benford/crawler.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// Local settings (RethinkDB location and Twitter credentials)
var config = require(__dirname+"/config.js");

// Twitter client, authenticated with the credentials from config.js
var Twit = require('twit')
var T = new Twit({
consumer_key: config.twitter.consumer_key,
consumer_secret: config.twitter.consumer_secret,
access_token: config.twitter.access_token,
access_token_secret: config.twitter.access_token_secret
});

var r = require('rethinkdb');

// NOTE(review): `data` is never assigned at module level in the visible code;
// listen() declares its own local `data`, so this one looks unused.
var data;
// RethinkDB connection, assigned once r.connect() succeeds and then used by listen()
var connection;

// Connect to RethinkDB, make sure the database, the `benford` table and the nine
// counter documents exist, then start consuming the Twitter stream.
r.connect({
  host: config.rethinkdb.host,
  port: config.rethinkdb.port,
  db: config.rethinkdb.db
}, function(err, conn) {
  if (err) {
    throw new Error("Could not open a connection to rethinkdb\n"+err.message)
  }

  connection = conn;
  var dbName = config.rethinkdb.db;

  // Step 3: insert one counter per digit 1..9. The digit, stored as a string, is the
  // primary key, so re-running against an initialized table leaves existing rows alone.
  function seedCounters() {
    var seeds = [];
    for (var digit = 1; digit <= 9; digit++) {
      seeds.push({id: String(digit), value: 0});
    }
    r.db(dbName).table('benford').insert(seeds).run(connection, function() {
      listen();
    });
  }

  // Step 2: create the table. An error here (e.g. the table already exists) is ignored.
  function createTable() {
    r.db(dbName).tableCreate('benford').run(connection, seedCounters);
  }

  // Step 1: create the database. An error here (e.g. it already exists) is ignored.
  r.dbCreate(dbName).run(connection, createTable);
});


/**
 * Listen to Twitter's sample stream and record leading-digit occurrences.
 *
 * For every tweet, the text is split on whitespace and each token that starts with
 * a digit from 1 to 9 is counted under that digit. One update per digit found is
 * then sent to the `benford` table, adding the tally to the stored `value`.
 * Writes are sent with `noreply`, i.e. without waiting for an answer.
 */
function listen() {
  var startsWithDigit = /^[1-9]/;
  var sampleStream = T.stream('statuses/sample');

  sampleStream.on('tweet', function (tweet) {
    // digit -> number of tokens in this tweet starting with that digit
    var tally = tweet.text.split(/\s+/).reduce(function (counts, token) {
      if (startsWithDigit.test(token)) {
        var first = token[0];
        counts[first] = (counts[first] || 0) + 1;
      }
      return counts;
    }, {});

    // Nothing is written when the tweet contains no number (empty tally)
    Object.keys(tally).forEach(function (key) {
      r.db(config.rethinkdb.db).table('benford').get(key)
        .update({value: r.row("value").add(tally[key])})
        .run(connection, {noreply: true});
    });
  });
}
11 changes: 11 additions & 0 deletions benford/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
  "name": "benford"
, "version": "0.0.1"
, "private": true
, "dependencies": {
    "express": "4.0.0"
  , "socket.io": "1.0.4"
  , "rethinkdb": "1.13.0-0"
  , "sticky-session": "0.1.0"
  , "twit": "1.1.20"
  }
}
7 changes: 7 additions & 0 deletions benford/public/bootstrap.min.css

Large diffs are not rendered by default.

113 changes: 113 additions & 0 deletions benford/public/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
<html>
<head>
<title>RethinkDB's feed with Socket.IO - Benford's law</title>
<link rel="stylesheet" href="bootstrap.min.css">
<link rel="stylesheet" type="text/css" href="style.css">
</head>

<body>
<div class="container">
<section>
<h1>RethinkDB's feed with Socket.IO</h1>
<h2>Introduction</h2>
<p>
This little example illustrates
<a href="http://en.wikipedia.org/wiki/Benford's_law">Benford's law</a> using
Twitter's streaming API.<br>The number of occurrences of each significant digit
is computed and updated in real time.
</p>
</section>

<section>
<h2>Results</h2>

<table>
<tr>
<th>Digit value</th>
<th>Occurrences</th>
<th>Percentage</th>
<th>Expected</th>
</tr>
<tr>
<td>1</td>
<td id="occurrences_1">Loading...</td>
<td id="percentage_1">Loading...</td>
<td id="expected_1">30.1%</td>
</tr>
<tr>
<td>2</td>
<td id="occurrences_2">Loading...</td>
<td id="percentage_2">Loading...</td>
<td id="expected_2">17.6%</td>
</tr>
<tr>
<td>3</td>
<td id="occurrences_3">Loading...</td>
<td id="percentage_3">Loading...</td>
<td id="expected_3">12.5%</td>
</tr>
<tr>
<td>4</td>
<td id="occurrences_4">Loading...</td>
<td id="percentage_4">Loading...</td>
<td id="expected_4">9.7%</td>
</tr>
<tr>
<td>5</td>
<td id="occurrences_5">Loading...</td>
<td id="percentage_5">Loading...</td>
<td id="expected_5">7.9%</td>
</tr>
<tr>
<td>6</td>
<td id="occurrences_6">Loading...</td>
<td id="percentage_6">Loading...</td>
<td id="expected_6">6.7%</td>
</tr>
<tr>
<td>7</td>
<td id="occurrences_7">Loading...</td>
<td id="percentage_7">Loading...</td>
<td id="expected_7">5.8%</td>
</tr>
<tr>
<td>8</td>
<td id="occurrences_8">Loading...</td>
<td id="percentage_8">Loading...</td>
<td id="expected_8">5.1%</td>
</tr>
<tr>
<td>9</td>
<td id="occurrences_9">Loading...</td>
<td id="percentage_9">Loading...</td>
<td id="expected_9">4.6%</td>
</tr>
</table>
</section>

<section>
<h2>How it works</h2>
<p>This example is composed of two parts:
<ul>
<li>
A Node.js script listens to Twitter's sample stream, extracts the first significant
digit of each number and saves the counts in RethinkDB.
</li>
<li>
A Node.js server listens to any database changes and broadcasts them to every
connected client using <a href="http://socket.io">Socket.io</a>.
</li>
</ul>
</p>
<p>
You can find the code on GitHub, in the
<a href="https://github.com/rethinkdb/rethinkdb-example-nodejs/tree/master/benford">node.js examples repository</a>
</p>
</section>

<script src="jquery-1.10.2.min.js"></script>
<script src="socket.io-1.0.4.js"></script>
<script src="main.js"></script>

</div>
</body>
</html>
6 changes: 6 additions & 0 deletions benford/public/jquery-1.10.2.min.js

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions benford/public/jquery-1.10.2.min.map

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions benford/public/main.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Client side: keeps the results table in sync with the counters pushed by the server.
//   'all'    -> full snapshot, an object mapping each digit to its number of occurrences
//   'update' -> a single change, with the document before (`old_val`) and after (`new_val`)
$(function() {
  var socket = io();

  // Last known number of occurrences per digit
  var counts = {};

  // Redraw every row: a change to one counter changes the total, and therefore
  // the percentage of every digit, not only the one that was updated.
  function render() {
    var total = 0;
    var digit;
    for (digit in counts) {
      total += counts[digit];
    }
    for (digit in counts) {
      // Avoid displaying "NaN%" while nothing has been counted yet
      var share = total > 0 ? counts[digit] / total * 100 : 0;
      $("#occurrences_" + digit).text(String(counts[digit]));
      $("#percentage_" + digit).text(share.toFixed(1) + "%");
    }
  }

  socket.on('all', function(alldata) {
    // The snapshot is sent again after a reconnection: start from scratch
    // instead of adding it on top of what was already counted.
    counts = {};
    for (var digit in alldata) {
      counts[digit] = alldata[digit];
    }
    render();
  });

  socket.on('update', function(data) {
    // `new_val` is missing when a document is removed; there is nothing to display then.
    // `old_val` is not needed, so a newly inserted document (no `old_val`) is handled too.
    if (!data || !data.new_val) {
      return;
    }
    counts[data.new_val.id] = data.new_val.value;
    render();
  });
});
3 changes: 3 additions & 0 deletions benford/public/socket.io-1.0.4.js

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions benford/public/style.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/* Results table cells: thin grey border, right-aligned content */
td,
th {
  margin: 0;
  border: 1px solid #ccc;
  padding: 5px 10px;
  text-align: right;
}

/* Horizontal gutter around each section, with some space below it */
section {
  margin: 0 40px 20px;
}

/* Centered page title */
h1 {
  text-align: center;
  margin: 30px 0;
}
63 changes: 63 additions & 0 deletions benford/server.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
var config = require(__dirname+"/config.js");

var express = require('express');
var r = require('rethinkdb');

var cluster = require('cluster');
var numCPUs = require('os').cpus().length;

var sticky = require('sticky-session');


// The `cluster` module is not used directly because socket.io would not work with it:
// `sticky` uses the request's IP so that a client always connects to the same worker.
sticky(function() {
  var app = express();

  // Serve static content
  app.use(express.static(__dirname + '/public'));

  var server = require('http').createServer(app);
  var io = require('socket.io')(server);

  // Last known value for each significant digit, keyed by the document's `id`
  var alldata = {};

  // Every time a client connects, send it all the data we have so far
  io.on('connection', function(socket) {
    socket.emit('all', alldata);
  });

  // One connection, configured from config.js, serves both the initial read and the feed
  r.connect({
    host: config.rethinkdb.host,
    port: config.rethinkdb.port,
    db: config.rethinkdb.db
  }, function(err, connection) {
    if (err) {
      throw new Error("Could not open a connection to rethinkdb\n"+err.message);
    }

    // Same database as the one crawler.js writes to
    var benford = r.db(config.rethinkdb.db).table('benford');

    // Initialize the values for each significant digit with what is in the database
    benford.run(connection, function(err, cursor) {
      if (err) {
        throw new Error("Could not retrieve the data from the server. Is `crawler.js` running?");
      }

      cursor.each(function(err, row) {
        if (err) {
          throw new Error("Could not read the data from the server\n"+err.message);
        }
        alldata[row.id] = row.value;
      });
    });

    // Open a feed to listen to the changes on the table
    benford.changes().run(connection, function(err, feed) {
      if (err) {
        throw new Error("Could not open a feed on the `benford` table. Is `crawler.js` running?");
      }

      feed.on('data', function(change) {
        // Broadcast the change to all the sockets
        io.sockets.emit('update', change);

        // Keep the snapshot sent to new clients up to date
        alldata[change.new_val.id] = change.new_val.value;
      });
    });
  });

  return server;
}).listen(config.http.port, function() {
  console.log('Server listening at port %d', config.http.port)
});