Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add date field check #169

Merged
merged 10 commits into from
Jul 19, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions ambiata-warden.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,16 @@ library
-Wall

cc-options:
-O3 -Wall -Werror

-O3
-Wall
-Werror
-Wbad-function-cast
-Wnested-externs
-Wstrict-prototypes
-Wmissing-prototypes
-Wmissing-declarations
-Waggregate-return

hs-source-dirs:
src gen

Expand Down Expand Up @@ -178,8 +186,9 @@ executable warden-gen
, resourcet == 1.1.*
, semigroups
, temporary == 1.2.*
, transformers >= 0.3 && < 5
, text
, time == 1.5.*
, transformers >= 0.3 && < 5
, unix >= 2.7.1 && < 2.7.3
, vector == 0.10.*

Expand Down Expand Up @@ -215,10 +224,11 @@ test-suite test
, filepath == 1.3.*
, ieee754 == 0.7.*
, lens == 4.9.*
, semigroups
, quickcheck-instances == 0.3.*
, semigroups
, temporary == 1.2.*
, text
, time == 1.5.*
, vector == 0.10.*

test-suite test-io
Expand Down Expand Up @@ -265,6 +275,7 @@ test-suite test-io
, semigroups
, temporary
, text == 1.2.*
, time == 1.5.*
, transformers >= 0.3 && < 5
, unix >= 2.7.1 && < 2.7.3
, vector == 0.10.*
Expand Down Expand Up @@ -311,6 +322,7 @@ benchmark bench
, semigroups
, temporary
, text == 1.2.*
, time == 1.5.*
, transformers >= 0.3 && < 5
, unix >= 2.7.1 && < 2.7.3
, vector == 0.10.*
Expand Down
144 changes: 0 additions & 144 deletions ambiata-warden.lock-7.8.4

This file was deleted.

17 changes: 16 additions & 1 deletion bench/bench.hs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,12 @@ prepareBools = fmap (fmap T.encodeUtf8) . generate' (Deterministic 555) (GenSize
-- | Benchmark input for the non-boolean case: the result of running
-- @vectorOf 100 renderedNonBool@ through @generate'@ with
-- @Deterministic 666@ and @GenSize 100@, with every element
-- UTF-8 encoded.
prepareNonBools :: IO [ByteString]
prepareNonBools = fmap T.encodeUtf8 <$> rendered
  where
    rendered =
      generate' (Deterministic 666) (GenSize 100) (vectorOf 100 renderedNonBool)

-- | Benchmark input for the date case: the result of running
-- @vectorOf 100 renderedDate@ through @generate'@ with
-- @Deterministic 555@ and @GenSize 100@.
prepareDates :: IO [ByteString]
prepareDates =
  generate' (Deterministic 555) (GenSize 100) (vectorOf 100 renderedDate)

-- | Benchmark input for the non-date case: the result of running
-- @vectorOf 100 renderedNonDate@ through @generate'@ with
-- @Deterministic 666@ and @GenSize 100@.
prepareNonDates :: IO [ByteString]
prepareNonDates =
  generate' (Deterministic 666) (GenSize 100) (vectorOf 100 renderedNonDate)

benchABDecode :: FileFormat -> NonEmpty ViewFile -> IO ()
benchABDecode ff vfs =
let sep = Separator . fromIntegral $ ord '|'
Expand Down Expand Up @@ -150,6 +156,9 @@ benchToRow = toRow . Right
-- | Apply 'checkFieldBool' to every field in the list, keeping the
-- results in input order.
benchCheckFieldBool :: [ByteString] -> [Bool]
benchCheckFieldBool fields = map checkFieldBool fields

-- | Apply 'checkFieldDate' to every field in the list, keeping the
-- results in input order.
benchCheckFieldDate :: [ByteString] -> [Bool]
benchCheckFieldDate fields = map checkFieldDate fields

main :: IO ()
main = do
withTempDirectory "." "warden-bench-" $ \root ->
Expand All @@ -160,11 +169,17 @@ main = do
, bench "decode/delimited-text/1000" $ nfIO (benchABDecode DelimitedText vfs)
, bench "decode/toRow/100" $ nf benchToRow bss
]
, env ((,,) <$> prepareRow <*> prepareBools <*> prepareNonBools) $ \ ~(rs, bools, nonbools) ->
, env ((,,,,) <$> prepareRow
<*> prepareBools
<*> prepareNonBools
<*> prepareDates
<*> prepareNonDates) $ \ ~(rs, bools, nonbools, dates, nondates) ->
bgroup "field-parsing" $ [
bench "parseField/200" $ nf benchFieldParse rs
, bench "checkFieldBool/boolean/100" $ nf benchCheckFieldBool bools
, bench "checkFieldBool/non-boolean/100" $ nf benchCheckFieldBool nonbools
, bench "checkFieldDate/date/100" $ nf benchCheckFieldDate dates
, bench "checkFieldDate/non-date/100" $ nf benchCheckFieldDate nondates
]
, env prepareFolds $ \ ~(rs, ts, piis, nonPiis, bs100, bs10) ->
bgroup "folds" $ [
Expand Down
66 changes: 64 additions & 2 deletions cbits/field.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
#include "field.h"
#include "predicates.h"

bool warden_field_bool(char *buf, size_t n) {
/* Returns TRUE if the buffer we're passed contains a bool, otherwise
* FALSE. */
bool warden_field_bool(const char *buf, size_t n) {
/* little-endian "false" */
static const int64_t false_bits = 0x00000065736c6166;
static const int64_t false_mask = 0x000000ffffffffff;
Expand Down Expand Up @@ -48,7 +50,7 @@ bool warden_field_bool(char *buf, size_t n) {
in scientific notation.

Otherwise returns non_numeric_field. */
numeric_field warden_field_numeric(char *buf, size_t n) {
numeric_field warden_field_numeric(const char *buf, size_t n) {
size_t i = 0;
int preradix_digits = 0; /* digits before the radix point */
int exponent_digits = 0; /* digits in the exponent (scientific notation) */
Expand Down Expand Up @@ -114,3 +116,63 @@ numeric_field warden_field_numeric(char *buf, size_t n) {
/* just cruft on the end after all */
return non_numeric_field;
}


/* Tells whether c is one of the three characters accepted between
   the parts of a date: '-', '/' or '.'. */
static inline bool is_separator(char c) {
	const bool is_dash = (c == '-');
	const bool is_slash = (c == '/');
	const bool is_dot = (c == '.');

	return is_dash || is_slash || is_dot;
}

/* Match a year in the 20xx century, in big-endian date format with or
   without separators.

   Returns true when buf begins with YYYYMMDD or YYYYxMMxDD, where each
   x is any character accepted by is_separator; otherwise FALSE. Bytes
   following the date are not examined. buf is never modified, and no
   byte at index n or beyond is read. */
static inline bool match_ymd(const char *buf, size_t n) {
	size_t i;

	/* The shortest thing we're willing to call a "date" at this
	   point is YYYYMMDD. */
	if (n < 8) {
		return FALSE;
	}

	/* The first two bytes must be "20". */
	if (buf[0] != '2' || buf[1] != '0') {
		return FALSE;
	}

	/* 0xc0 = 0x80 | 0x40 - if these bits are set, the byte is too
	   high to be a digit or a separator. Each of the remaining six
	   bytes is tested on its own rather than through one 64-bit
	   load of the buffer, so the result does not depend on the
	   host's byte order or on the alignment of buf, and the const
	   qualifier is never cast away. */
	for (i = 2; i < 8; i++) {
		if (((unsigned char) buf[i] & 0xc0) != 0) {
			return FALSE;
		}
	}

	/* Remaining two digits of the year. */
	if (!(is_digit(buf[2]) && is_digit(buf[3]))) {
		return FALSE;
	}

	/* YYYY-MM-DD: needs two more bytes than the minimum, so the
	   length is re-checked before buf[8] and buf[9] are read. */
	if (is_separator(buf[4])) {
		return (n >= 10 &&
			is_digit(buf[5]) &&
			is_digit(buf[6]) &&
			is_separator(buf[7]) &&
			is_digit(buf[8]) &&
			is_digit(buf[9]));
	}

	/* YYYYMMDD */
	return (is_digit(buf[4]) &&
		is_digit(buf[5]) &&
		is_digit(buf[6]) &&
		is_digit(buf[7]));
}

/* Decide whether the n bytes at buf look like a date: TRUE if they
   do, FALSE if they don't.

   At present the only test applied is match_ymd, i.e. the field has
   to begin with a big-endian (year-first) date.

   FIXME: more supported date formats
*/
bool warden_field_datetime(const char *buf, size_t n) {
	const bool starts_with_ymd = match_ymd(buf, n);

	return starts_with_ymd;
}
6 changes: 4 additions & 2 deletions cbits/field.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@ typedef enum _numeric_field {
real_field = 2
} numeric_field;

bool warden_field_bool(char *, size_t);
bool warden_field_bool(const char *, size_t);

numeric_field warden_field_numeric(char *, size_t);
numeric_field warden_field_numeric(const char *, size_t);

bool warden_field_datetime(const char *, size_t);

#endif
Loading