From 43a421237187d75810148eaf34af09be96cf7d7f Mon Sep 17 00:00:00 2001 From: schuemie Date: Sat, 7 Jun 2014 10:05:27 -0400 Subject: [PATCH] Fixed scan code to run on all supported DBMSs. --- src/org/ohdsi/utilities/files/Row.java | 7 + .../ohdsi/whiteRabbit/WhiteRabbitMain.java | 208 +++++++++--------- .../fakeDataGenerator/FakeDataGenerator.java | 83 ++++--- .../whiteRabbit/scan/SourceDataScan.java | 108 ++++----- 4 files changed, 220 insertions(+), 186 deletions(-) diff --git a/src/org/ohdsi/utilities/files/Row.java b/src/org/ohdsi/utilities/files/Row.java index 7d3a0d81..4de5e08f 100644 --- a/src/org/ohdsi/utilities/files/Row.java +++ b/src/org/ohdsi/utilities/files/Row.java @@ -145,4 +145,11 @@ public void remove(String field) { else tempMap.put(entry.getKey(), entry.getValue()); } + + public void upperCaseFieldNames() { + Map tempMap = new HashMap(); + for (Map.Entry entry : fieldName2ColumnIndex.entrySet()) + tempMap.put(entry.getKey().toUpperCase(), entry.getValue()); + fieldName2ColumnIndex = tempMap; + } } diff --git a/src/org/ohdsi/whiteRabbit/WhiteRabbitMain.java b/src/org/ohdsi/whiteRabbit/WhiteRabbitMain.java index 737981e1..2d2be606 100644 --- a/src/org/ohdsi/whiteRabbit/WhiteRabbitMain.java +++ b/src/org/ohdsi/whiteRabbit/WhiteRabbitMain.java @@ -73,11 +73,11 @@ import org.ohdsi.whiteRabbit.scan.SourceDataScan; public class WhiteRabbitMain { - + private JFrame frame; private JTextField folderField; private JTextField scanReportFileField; - + private JComboBox scanRowCount; private JCheckBox scanValueScan; private JSpinner scanMinCellCount; @@ -99,59 +99,59 @@ public class WhiteRabbitMain { private List tables = new ArrayList(); private boolean sourceIsFiles = true; private boolean targetIsFiles = false; - + private List componentsToDisableWhenRunning = new ArrayList(); - + public static void main(String[] args) { new WhiteRabbitMain(args); } - + public WhiteRabbitMain(String[] args) { frame = new JFrame("White Rabbit"); - + frame.addWindowListener(new WindowAdapter() { public void windowClosing(WindowEvent e) { System.exit(0); } }); frame.setLayout(new BorderLayout()); - + JComponent tabsPanel = createTabsPanel(); JComponent consolePanel = createConsolePanel(); - + frame.add(consolePanel, BorderLayout.CENTER); frame.add(tabsPanel, BorderLayout.NORTH); - + loadIcons(frame); frame.pack(); frame.setVisible(true); ObjectExchange.frame = frame; executeParameters(args); } - + private JComponent createTabsPanel() { JTabbedPane tabbedPane = new JTabbedPane(); - + JPanel locationPanel = createLocationsPanel(); tabbedPane.addTab("Locations", null, locationPanel, "Specify the location of the source data and the working folder"); - + JPanel scanPanel = createScanPanel(); tabbedPane.addTab("Scan", null, scanPanel, "Create a scan of the source data"); - + JPanel fakeDataPanel = createFakeDataPanel(); tabbedPane.addTab("Fake data generation", null, fakeDataPanel, "Create fake data based on a scan report for development purposes"); - + return tabbedPane; } - + private JPanel createLocationsPanel() { JPanel panel = new JPanel(); - + panel.setLayout(new GridBagLayout()); GridBagConstraints c = new GridBagConstraints(); c.fill = GridBagConstraints.BOTH; c.weightx = 0.5; - + JPanel folderPanel = new JPanel(); folderPanel.setLayout(new BoxLayout(folderPanel, BoxLayout.X_AXIS)); folderPanel.setBorder(BorderFactory.createTitledBorder("Working folder")); @@ -172,7 +172,7 @@ public void actionPerformed(ActionEvent e) { c.gridy = 0; c.gridwidth = 1; panel.add(folderPanel, c); - + JPanel sourcePanel = new JPanel(); sourcePanel.setLayout(new GridLayout(0, 2)); sourcePanel.setBorder(BorderFactory.createTitledBorder("Source data location")); @@ -180,7 +180,7 @@ public void actionPerformed(ActionEvent e) { sourceType = new JComboBox(new String[] { "Delimited text files", "MySQL", "Oracle", "SQL Server", "PostgreSQL" }); sourceType.setToolTipText("Select the type of source data available"); sourceType.addItemListener(new ItemListener() { - + @Override public void itemStateChanged(ItemEvent arg0) { sourceIsFiles = arg0.getItem().toString().equals("Delimited text files"); @@ -190,7 +190,7 @@ public void itemStateChanged(ItemEvent arg0) { sourceDatabaseField.setEnabled(!sourceIsFiles); sourceDelimiterField.setEnabled(sourceIsFiles); addAllButton.setEnabled(!sourceIsFiles); - + if (!sourceIsFiles && arg0.getItem().toString().equals("Oracle")) { sourceServerField .setToolTipText("For Oracle servers this field contains the SID, servicename, and optionally the port: '/', ':/', '/', or ':/'"); @@ -216,7 +216,7 @@ public void itemStateChanged(ItemEvent arg0) { } }); sourcePanel.add(sourceType); - + sourcePanel.add(new JLabel("Server location")); sourceServerField = new JTextField("127.0.0.1"); sourceServerField.setEnabled(false); @@ -233,21 +233,21 @@ public void itemStateChanged(ItemEvent arg0) { sourceDatabaseField = new JTextField(""); sourceDatabaseField.setEnabled(false); sourcePanel.add(sourceDatabaseField); - + sourcePanel.add(new JLabel("Delimiter")); sourceDelimiterField = new JTextField(","); sourceDelimiterField.setToolTipText("The delimiter that separates values. Enter 'tab' for tab."); sourcePanel.add(sourceDelimiterField); - + c.gridx = 0; c.gridy = 1; c.gridwidth = 1; panel.add(sourcePanel, c); - + JPanel testConnectionButtonPanel = new JPanel(); testConnectionButtonPanel.setLayout(new BoxLayout(testConnectionButtonPanel, BoxLayout.X_AXIS)); testConnectionButtonPanel.add(Box.createHorizontalGlue()); - + JButton testConnectionButton = new JButton("Test connection"); testConnectionButton.setBackground(new Color(151, 220, 141)); testConnectionButton.setToolTipText("Test the connection"); @@ -258,26 +258,26 @@ public void actionPerformed(ActionEvent e) { }); componentsToDisableWhenRunning.add(testConnectionButton); testConnectionButtonPanel.add(testConnectionButton); - + c.gridx = 0; c.gridy = 2; c.gridwidth = 1; panel.add(testConnectionButtonPanel, c); - + return panel; } - + private JPanel createScanPanel() { JPanel panel = new JPanel(); panel.setLayout(new BorderLayout()); - + JPanel tablePanel = new JPanel(); tablePanel.setLayout(new BorderLayout()); tablePanel.setBorder(new TitledBorder("Tables to scan")); tableList = new JList(); tableList.setToolTipText("Specify the tables (or CSV files) to be scanned here"); tablePanel.add(new JScrollPane(tableList), BorderLayout.CENTER); - + JPanel tableButtonPanel = new JPanel(); tableButtonPanel.setLayout(new GridLayout(3, 1)); addAllButton = new JButton("Add all in DB"); @@ -306,19 +306,19 @@ public void actionPerformed(ActionEvent e) { }); tableButtonPanel.add(removeButton); tablePanel.add(tableButtonPanel, BorderLayout.EAST); - + panel.add(tablePanel, BorderLayout.CENTER); - + JPanel southPanel = new JPanel(); southPanel.setLayout(new BoxLayout(southPanel, BoxLayout.Y_AXIS)); - + JPanel scanOptionsPanel = new JPanel(); scanOptionsPanel.setLayout(new BoxLayout(scanOptionsPanel, BoxLayout.X_AXIS)); - + scanValueScan = new JCheckBox("Scan field values", true); scanValueScan.setToolTipText("Include a frequency count of field values in the scan report"); scanValueScan.addChangeListener(new ChangeListener() { - + @Override public void stateChanged(ChangeEvent arg0) { scanMinCellCount.setEnabled(((JCheckBox) arg0.getSource()).isSelected()); @@ -327,28 +327,28 @@ public void stateChanged(ChangeEvent arg0) { }); scanOptionsPanel.add(scanValueScan); scanOptionsPanel.add(Box.createHorizontalGlue()); - + scanOptionsPanel.add(new JLabel("Min cell count ")); scanMinCellCount = new JSpinner(); scanMinCellCount.setValue(25); scanMinCellCount.setToolTipText("Minimum frequency for a field value to be included in the report"); scanOptionsPanel.add(scanMinCellCount); scanOptionsPanel.add(Box.createHorizontalGlue()); - + scanOptionsPanel.add(new JLabel("Rows per table ")); scanRowCount = new JComboBox(new String[] { "100,000", "1 million", "all" }); scanRowCount.setSelectedIndex(1); scanRowCount.setToolTipText("Maximum number of rows per table to be scanned for field values"); scanOptionsPanel.add(scanRowCount); - + southPanel.add(scanOptionsPanel); - + southPanel.add(Box.createVerticalStrut(3)); - + JPanel scanButtonPanel = new JPanel(); scanButtonPanel.setLayout(new BoxLayout(scanButtonPanel, BoxLayout.X_AXIS)); scanButtonPanel.add(Box.createHorizontalGlue()); - + JButton scanButton = new JButton("Scan tables"); scanButton.setBackground(new Color(151, 220, 141)); scanButton.setToolTipText("Scan the selected tables"); @@ -360,20 +360,20 @@ public void actionPerformed(ActionEvent e) { componentsToDisableWhenRunning.add(scanButton); scanButtonPanel.add(scanButton); southPanel.add(scanButtonPanel); - + panel.add(southPanel, BorderLayout.SOUTH); - + return panel; } - + private JPanel createFakeDataPanel() { JPanel panel = new JPanel(); - + panel.setLayout(new GridBagLayout()); GridBagConstraints c = new GridBagConstraints(); c.fill = GridBagConstraints.BOTH; c.weightx = 0.5; - + JPanel folderPanel = new JPanel(); folderPanel.setLayout(new BoxLayout(folderPanel, BoxLayout.X_AXIS)); folderPanel.setBorder(BorderFactory.createTitledBorder("Scan report file")); @@ -394,7 +394,7 @@ public void actionPerformed(ActionEvent e) { c.gridy = 0; c.gridwidth = 1; panel.add(folderPanel, c); - + JPanel targetPanel = new JPanel(); targetPanel.setLayout(new GridLayout(0, 2)); targetPanel.setBorder(BorderFactory.createTitledBorder("Target data location")); @@ -403,7 +403,7 @@ public void actionPerformed(ActionEvent e) { targetType = new JComboBox(new String[] { "MySQL" }); targetType.setToolTipText("Select the type of source data available"); targetType.addItemListener(new ItemListener() { - + @Override public void itemStateChanged(ItemEvent arg0) { targetIsFiles = arg0.getItem().toString().equals("Delimited text files"); @@ -412,7 +412,7 @@ public void itemStateChanged(ItemEvent arg0) { targetPasswordField.setEnabled(!targetIsFiles); targetDatabaseField.setEnabled(!targetIsFiles); targetDelimiterField.setEnabled(targetIsFiles); - + if (!targetIsFiles && arg0.getItem().toString().equals("Oracle")) { targetServerField .setToolTipText("For Oracle servers this field contains the SID, servicename, and optionally the port: '/', ':/', '/', or ':/'"); @@ -438,7 +438,7 @@ public void itemStateChanged(ItemEvent arg0) { } }); targetPanel.add(targetType); - + targetPanel.add(new JLabel("Server location")); targetServerField = new JTextField("127.0.0.1"); targetPanel.add(targetServerField); @@ -451,27 +451,27 @@ public void itemStateChanged(ItemEvent arg0) { targetPanel.add(new JLabel("Database name")); targetDatabaseField = new JTextField(""); targetPanel.add(targetDatabaseField); - + targetPanel.add(new JLabel("Delimiter")); targetDelimiterField = new JTextField(","); targetDelimiterField.setToolTipText("The delimiter that separates values. Enter 'tab' for tab."); targetDelimiterField.setEnabled(false); targetPanel.add(targetDelimiterField); - + c.gridx = 0; c.gridy = 1; c.gridwidth = 1; panel.add(targetPanel, c); - + JPanel fakeDataButtonPanel = new JPanel(); fakeDataButtonPanel.setLayout(new BoxLayout(fakeDataButtonPanel, BoxLayout.X_AXIS)); - + fakeDataButtonPanel.add(new JLabel("Max rows per table")); generateRowCount = new JSpinner(); generateRowCount.setValue(10000); fakeDataButtonPanel.add(generateRowCount); fakeDataButtonPanel.add(Box.createHorizontalGlue()); - + JButton testConnectionButton = new JButton("Test connection"); testConnectionButton.setBackground(new Color(151, 220, 141)); testConnectionButton.setToolTipText("Test the connection"); @@ -482,7 +482,7 @@ public void actionPerformed(ActionEvent e) { }); componentsToDisableWhenRunning.add(testConnectionButton); fakeDataButtonPanel.add(testConnectionButton); - + JButton fakeDataButton = new JButton("Generate fake data"); fakeDataButton.setBackground(new Color(151, 220, 141)); fakeDataButton.setToolTipText("Generate fake data based on the scan report"); @@ -493,15 +493,15 @@ public void actionPerformed(ActionEvent e) { }); componentsToDisableWhenRunning.add(fakeDataButton); fakeDataButtonPanel.add(fakeDataButton); - + c.gridx = 0; c.gridy = 2; c.gridwidth = 1; panel.add(fakeDataButtonPanel, c); - + return panel; } - + private JComponent createConsolePanel() { JTextArea consoleArea = new JTextArea(); consoleArea.setToolTipText("General progress information"); @@ -518,7 +518,7 @@ private JComponent createConsolePanel() { ObjectExchange.console = console; return consoleScrollPane; } - + private void loadIcons(JFrame f) { List icons = new ArrayList(); icons.add(loadIcon("WhiteRabbit16.png", f)); @@ -529,7 +529,7 @@ private void loadIcons(JFrame f) { icons.add(loadIcon("WhiteRabbit256.png", f)); f.setIconImages(icons); } - + private Image loadIcon(String name, JFrame f) { Image icon = Toolkit.getDefaultToolkit().getImage(WhiteRabbitMain.class.getResource(name)); MediaTracker mediaTracker = new MediaTracker(f); @@ -542,7 +542,7 @@ private Image loadIcon(String name, JFrame f) { } return null; } - + private void executeParameters(String[] args) { String mode = null; for (String arg : args) { @@ -575,7 +575,7 @@ private void executeParameters(String[] args) { } } } - + private void pickFolder() { JFileChooser fileChooser = new JFileChooser(new File(folderField.getText())); fileChooser.setFileSelectionMode(JFileChooser.DIRECTORIES_ONLY); @@ -583,7 +583,7 @@ private void pickFolder() { if (returnVal == JFileChooser.APPROVE_OPTION) folderField.setText(fileChooser.getSelectedFile().getAbsolutePath()); } - + private void pickScanReportFile() { JFileChooser fileChooser = new JFileChooser(new File(folderField.getText())); fileChooser.setFileSelectionMode(JFileChooser.FILES_ONLY); @@ -591,14 +591,14 @@ private void pickScanReportFile() { if (returnVal == JFileChooser.APPROVE_OPTION) scanReportFileField.setText(fileChooser.getSelectedFile().getAbsolutePath()); } - + private void removeTables() { for (Object item : tableList.getSelectedValues()) { tables.remove(item); tableList.setListData(tables.toArray()); } } - + private void addAllTables() { DbSettings sourceDbSettings = getSourceDbSettings(); if (sourceDbSettings != null) { @@ -612,7 +612,7 @@ private void addAllTables() { connection.close(); } } - + private void pickTables() { DbSettings sourceDbSettings = getSourceDbSettings(); if (sourceDbSettings != null) { @@ -622,7 +622,7 @@ private void pickTables() { fileChooser.setFileSelectionMode(JFileChooser.FILES_ONLY); FileNameExtensionFilter filter = new FileNameExtensionFilter("Delimited text files", "csv", "txt"); fileChooser.setFileFilter(filter); - + int returnVal = fileChooser.showDialog(frame, "Select tables"); if (returnVal == JFileChooser.APPROVE_OPTION) { for (File table : fileChooser.getSelectedFiles()) { @@ -631,7 +631,7 @@ private void pickTables() { tables.add(tableName); tableList.setListData(tables.toArray()); } - + } } else if (sourceDbSettings.dataType == DbSettings.DATABASE) { RichConnection connection = new RichConnection(sourceDbSettings.server, sourceDbSettings.domain, sourceDbSettings.user, @@ -649,7 +649,7 @@ private void pickTables() { } } } - + private DbSettings getSourceDbSettings() { DbSettings dbSettings = new DbSettings(); if (sourceType.getSelectedItem().equals("Delimited text files")) { @@ -688,7 +688,7 @@ else if (sourceType.getSelectedItem().toString().equals("SQL Server")) { } return dbSettings; } - + private void testConnection(DbSettings dbSettings) { if (dbSettings.dataType == DbSettings.CSVFILES) { if (new File(folderField.getText()).exists()) { @@ -709,7 +709,7 @@ private void testConnection(DbSettings dbSettings) { JOptionPane.ERROR_MESSAGE); return; } - + RichConnection connection; try { connection = new RichConnection(dbSettings.server, dbSettings.domain, dbSettings.user, dbSettings.password, dbSettings.dbType); @@ -718,7 +718,7 @@ private void testConnection(DbSettings dbSettings) { JOptionPane.showMessageDialog(frame, StringUtilities.wordWrap(message, 80), "Error connecting to server", JOptionPane.ERROR_MESSAGE); return; } - + try { connection.getTableNames(dbSettings.database); } catch (Exception e) { @@ -726,14 +726,14 @@ private void testConnection(DbSettings dbSettings) { JOptionPane.showMessageDialog(frame, StringUtilities.wordWrap(message, 80), "Error connecting to server", JOptionPane.ERROR_MESSAGE); return; } - + connection.close(); String message = "Succesfully connected to " + dbSettings.database + " on server " + dbSettings.server; JOptionPane.showMessageDialog(frame, StringUtilities.wordWrap(message, 80), "Connection succesful", JOptionPane.INFORMATION_MESSAGE); - + } } - + private DbSettings getTargetDbSettings() { DbSettings dbSettings = new DbSettings(); dbSettings.dataType = DbSettings.DATABASE; @@ -757,16 +757,16 @@ else if (sourceType.getSelectedItem().toString().equals("SQL Server")) { } } } - + if (dbSettings.database.trim().length() == 0) { String message = "Please specify a name for the target database"; JOptionPane.showMessageDialog(frame, StringUtilities.wordWrap(message, 80), "Database error", JOptionPane.ERROR_MESSAGE); return null; } - + return dbSettings; } - + private void scanRun() { if (tables.size() == 0) { if (sourceIsFiles) { @@ -790,34 +790,34 @@ else if (scanRowCount.getSelectedItem().toString().equals("1 million")) rowCount = 1000000; if (scanRowCount.getSelectedItem().toString().equals("all")) rowCount = -1; - + ScanThread scanThread = new ScanThread(rowCount, scanValueScan.isSelected(), Integer.parseInt(scanMinCellCount.getValue().toString())); scanThread.start(); } - + private void fakeDataRun() { String filename = scanReportFileField.getText(); if (!new File(filename).exists()) { String message = "File " + filename + " not found"; JOptionPane.showMessageDialog(frame, StringUtilities.wordWrap(message, 80), "File not found", JOptionPane.ERROR_MESSAGE); } else { - FakeDataThread thread = new FakeDataThread(Integer.parseInt(scanMinCellCount.getValue().toString()), filename); + FakeDataThread thread = new FakeDataThread(Integer.parseInt(generateRowCount.getValue().toString()), filename); thread.start(); } } - + private class ScanThread extends Thread { - + private int maxRows; private boolean scanValues; private int minCellCount; - + public ScanThread(int maxRows, boolean scanValues, int minCellCount) { this.maxRows = maxRows; this.scanValues = scanValues; this.minCellCount = minCellCount; } - + public void run() { for (JComponent component : componentsToDisableWhenRunning) component.setEnabled(false); @@ -839,18 +839,18 @@ public void run() { component.setEnabled(true); } } - + } - + private class FakeDataThread extends Thread { private int maxRowCount; private String filename; - + public FakeDataThread(int maxRowCount, String filename) { this.maxRowCount = maxRowCount; this.filename = filename; } - + public void run() { for (JComponent component : componentsToDisableWhenRunning) component.setEnabled(false); @@ -865,37 +865,37 @@ public void run() { for (JComponent component : componentsToDisableWhenRunning) component.setEnabled(true); } - + } } - + private class DBTableSelectionDialog extends JDialog implements ActionListener { private static final long serialVersionUID = 4527207331482143091L; private JButton yesButton = null; private JButton noButton = null; private boolean answer = false; private JList list; - + public boolean getAnswer() { return answer; } - + public DBTableSelectionDialog(JFrame frame, boolean modal, String tableNames) { super(frame, modal); - + setTitle("Select tables"); JPanel panel = new JPanel(); panel.setPreferredSize(new Dimension(800, 500)); getContentPane().add(panel); panel.setLayout(new BorderLayout()); - + JLabel message = new JLabel("Select tables"); panel.add(message, BorderLayout.NORTH); - + list = new JList(tableNames.split("\t")); JScrollPane scrollPane = new JScrollPane(list); panel.add(scrollPane, BorderLayout.CENTER); - + JPanel buttonPanel = new JPanel(); yesButton = new JButton("Select tables"); yesButton.addActionListener(this); @@ -904,12 +904,12 @@ public DBTableSelectionDialog(JFrame frame, boolean modal, String tableNames) { noButton.addActionListener(this); buttonPanel.add(noButton); panel.add(buttonPanel, BorderLayout.SOUTH); - + pack(); setLocationRelativeTo(frame); setVisible(true); } - + public void actionPerformed(ActionEvent e) { if (yesButton == e.getSource()) { answer = true; @@ -919,13 +919,13 @@ public void actionPerformed(ActionEvent e) { setVisible(false); } } - + public Object[] getSelectedItems() { return list.getSelectedValues(); } - + } - + private void handleError(Exception e) { System.err.println("Error: " + e.getMessage()); String errorReportFilename = ErrorReport.generate(folderField.getText(), e); @@ -934,5 +934,5 @@ private void handleError(Exception e) { System.out.println(message); JOptionPane.showMessageDialog(frame, StringUtilities.wordWrap(message, 80), "Error", JOptionPane.ERROR_MESSAGE); } - + } diff --git a/src/org/ohdsi/whiteRabbit/fakeDataGenerator/FakeDataGenerator.java b/src/org/ohdsi/whiteRabbit/fakeDataGenerator/FakeDataGenerator.java index d42b3ec4..be8c1dfd 100644 --- a/src/org/ohdsi/whiteRabbit/fakeDataGenerator/FakeDataGenerator.java +++ b/src/org/ohdsi/whiteRabbit/fakeDataGenerator/FakeDataGenerator.java @@ -37,19 +37,19 @@ import org.ohdsi.whiteRabbit.DbSettings; public class FakeDataGenerator { - + private RichConnection connection; private DbType dbType; private OneToManySet primaryKeyToValues; private int maxRowsPerTable = 1000; - + private static int REGULAR = 0; private static int RANDOM = 1; private static int PRIMARY_KEY = 2; - + public static void main(String[] args) { FakeDataGenerator fakeDataGenerator = new FakeDataGenerator(); - + DbSettings dbSettings = new DbSettings(); dbSettings.dataType = DbSettings.DATABASE; dbSettings.dbType = DbType.MYSQL; @@ -65,30 +65,30 @@ public static void main(String[] args) { // dbSettings.database = "CDM_THIN"; // fakeDataGenerator.generateData(dbSettings, "S:/Data/THIN/ScanReport.xlsx"); } - + public void generateData(DbSettings dbSettings, int maxRowsPerTable, String filename) { StringUtilities.outputWithTime("Starting creation of fake data"); System.out.println("Loading scan report from " + filename); Database database = Database.generateModelFromScanReport(filename); - + dbType = dbSettings.dbType; this.maxRowsPerTable = maxRowsPerTable; - + connection = new RichConnection(dbSettings.server, dbSettings.domain, dbSettings.user, dbSettings.password, dbSettings.dbType); connection.use(dbSettings.database); - + findValuesForPrimaryKeys(database); - + for (Table table : database.getTables()) { System.out.println("Generating table " + table.getName()); createTable(table); populateTable(table); } - + connection.close(); StringUtilities.outputWithTime("Done"); } - + private void findValuesForPrimaryKeys(Database database) { Set primaryKeys = new HashSet(); for (Table table : database.getTables()) { @@ -98,7 +98,7 @@ private void findValuesForPrimaryKeys(Database database) { } } } - + primaryKeyToValues = new OneToManySet(); for (Table table : database.getTables()) { for (Field field : table.getFields()) { @@ -110,7 +110,7 @@ private void findValuesForPrimaryKeys(Database database) { } } } - + private void populateTable(Table table) { String[] fieldNames = new String[table.getFields().size()]; ValueGenerator[] valueGenerators = new ValueGenerator[table.getFields().size()]; @@ -132,7 +132,7 @@ private void populateTable(Table table) { } connection.insertIntoTable(rows.iterator(), table.getName(), false); } - + private void createTable(Table table) { StringBuilder sql = new StringBuilder(); sql.append("CREATE TABLE " + table.getName() + " (\n"); @@ -145,25 +145,48 @@ private void createTable(Table table) { sql.append("\n);"); connection.execute(sql.toString()); } - + private String correctType(Field field) { + String type = field.getType(); if (dbType == DbType.MYSQL) { - if (field.getType().toUpperCase().equals("VARCHAR")) + if (isVarChar(type)) return "VARCHAR(" + field.getMaxLength() + ")"; - else if (field.getType().toUpperCase().equals("INT")) + else if (isInt(type)) return "BIGINT"; - else if (field.getType().equals("Real")) + else if (isNumber(type)) return "DOUBLE"; - else if (field.getType().equals("Empty")) + else if (isText(type)) + return "TEXT"; + else if (type.equals("EMPTY")) return "VARCHAR(255)"; else - return field.getType(); + return type; } return null; } - + + private boolean isVarChar(String type) { + type = type.toUpperCase(); + return (type.equals("VARCHAR") || type.equals("VARCHAR2") || type.equals("CHARACTER VARYING")); + } + + private boolean isInt(String type) { + type = type.toUpperCase(); + return (type.equals("INT") || type.equals("INTEGER") || type.equals("BIGINT")); + } + + private boolean isNumber(String type) { + type = type.toUpperCase(); + return (type.equals("REAL") || type.equals("DOUBLE") || type.equals("NUMBER") || type.equals("FLOAT") || type.equals("DOUBLE PRECISION")); + } + + private boolean isText(String type) { + type = type.toUpperCase(); + return (type.equals("TEXT") || type.equals("CLOB")); + } + private class ValueGenerator { - + private String[] values; private int[] cumulativeFrequency; private int totalFrequency; @@ -172,7 +195,7 @@ private class ValueGenerator { private int cursor; private int generatorType = REGULAR; private Random random = new Random(); - + public ValueGenerator(Field field) { String[][] valueCounts = field.getValueCounts(); type = field.getType(); @@ -190,21 +213,21 @@ public ValueGenerator(Field field) { int length = valueCounts.length; if (valueCounts[length - 1][1].equals("")) // Last value could be "List truncated..." length--; - + values = new String[length]; cumulativeFrequency = new int[length]; totalFrequency = 0; for (int i = 0; i < length; i++) { int frequency = (int) (Double.parseDouble(valueCounts[i][1])); totalFrequency += frequency; - + values[i] = valueCounts[i][0]; cumulativeFrequency[i] = totalFrequency; } generatorType = REGULAR; } } - + private String[] convertToArray(Set set) { String[] array = new String[set.size()]; int i = 0; @@ -212,15 +235,15 @@ private String[] convertToArray(Set set) { array[i++] = item; return array; } - + public String generate() { if (generatorType == RANDOM) { // Random generate a string: - if (type.equals("VarChar")) { + if (isVarChar(type)) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < length; i++) sb.append(Character.toChars(65 + random.nextInt(26))); return sb.toString(); - } else if (type.equals("Integer")) { + } else if (isInt(type)) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < length; i++) sb.append(Character.toChars(48 + random.nextInt(10))); @@ -250,5 +273,5 @@ else if (type.equals("Empty")) } } } - + } diff --git a/src/org/ohdsi/whiteRabbit/scan/SourceDataScan.java b/src/org/ohdsi/whiteRabbit/scan/SourceDataScan.java index dda5b32c..7b3c5ed7 100644 --- a/src/org/ohdsi/whiteRabbit/scan/SourceDataScan.java +++ b/src/org/ohdsi/whiteRabbit/scan/SourceDataScan.java @@ -46,20 +46,20 @@ import org.ohdsi.whiteRabbit.DbSettings; public class SourceDataScan { - + public static int MAX_VALUES_IN_MEMORY = 100000; public static int MAX_VALUES_TO_REPORT = 25000; public static int MIN_CELL_COUNT_FOR_CSV = 1000000; public static int N_FOR_FREE_TEXT_CHECK = 1000; public static int MIN_AVERAGE_LENGTH_FOR_FREE_TEXT = 100; - + private char delimiter = ','; private int sampleSize; private boolean scanValues; private int minCellCount; private DbType dbType; private String database; - + public static void main(String[] args) { // DbSettings dbSettings = new DbSettings(); // dbSettings.dataType = DbSettings.DATABASE; @@ -71,7 +71,7 @@ public static void main(String[] args) { // dbSettings.password = "F1r3starter"; // SourceDataScan scan = new SourceDataScan(); // scan.process(dbSettings, 1000000, true, 25, "s:/data/ScanReport.xlsx"); - + // DbSettings dbSettings = new DbSettings(); // dbSettings.dataType = DbSettings.DATABASE; // dbSettings.dbType = DbType.ORACLE; @@ -82,7 +82,7 @@ public static void main(String[] args) { // dbSettings.password = "F1r3starter"; // SourceDataScan scan = new SourceDataScan(); // scan.process(dbSettings, 1000000, "s:/data/ScanReport.xlsx"); - + // DbSettings dbSettings = new DbSettings(); // dbSettings.dataType = DbSettings.DATABASE; // dbSettings.dbType = DbType.MSSQL; @@ -94,7 +94,7 @@ public static void main(String[] args) { // dbSettings.tables.add("core"); // SourceDataScan scan = new SourceDataScan(); // scan.process(dbSettings, 1000000, "s:/data/ScanReport.xlsx"); - + // DbSettings dbSettings = new DbSettings(); // dbSettings.dataType = DbSettings.DATABASE; // dbSettings.dbType = DbType.MYSQL; @@ -106,7 +106,7 @@ public static void main(String[] args) { // dbSettings.tables.add("provider"); // SourceDataScan scan = new SourceDataScan(); // scan.process(dbSettings, 100000, true, 25, "c:/temp/ScanReport.xlsx"); - + DbSettings dbSettings = new DbSettings(); dbSettings.dataType = DbSettings.CSVFILES; dbSettings.delimiter = ','; @@ -115,7 +115,7 @@ public static void main(String[] args) { SourceDataScan scan = new SourceDataScan(); scan.process(dbSettings, 100000, false, 25, "c:/temp/ScanReport.xlsx"); } - + public void process(DbSettings dbSettings, int sampleSize, boolean scanValues, int minCellCount, String filename) { this.sampleSize = sampleSize; this.scanValues = scanValues; @@ -129,25 +129,25 @@ public void process(DbSettings dbSettings, int sampleSize, boolean scanValues, i tableToFieldInfos = processDatabase(dbSettings); generateReport(tableToFieldInfos, filename); } - + private Map> processDatabase(DbSettings dbSettings) { Map> tableToFieldInfos = new HashMap>(); RichConnection connection = new RichConnection(dbSettings.server, dbSettings.domain, dbSettings.user, dbSettings.password, dbSettings.dbType); connection.setVerbose(false); connection.use(dbSettings.database); - + dbType = dbSettings.dbType; database = dbSettings.database; - + for (String table : dbSettings.tables) { List fieldInfos = processDatabaseTable(table, connection); tableToFieldInfos.put(table, fieldInfos); } - + connection.close(); return tableToFieldInfos; } - + private Map> processCsvFiles(DbSettings dbSettings) { delimiter = dbSettings.delimiter; Map> tableToFieldInfos = new HashMap>(); @@ -158,15 +158,15 @@ private Map> processCsvFiles(DbSettings dbSettings) { } return tableToFieldInfos; } - + private void generateReport(Map> tableToFieldInfos, String filename) { System.out.println("Generating scan report"); removeEmptyTables(tableToFieldInfos); List tables = new ArrayList(tableToFieldInfos.keySet()); Collections.sort(tables); - + SXSSFWorkbook workbook = new SXSSFWorkbook(100); // keep 100 rows in memory, exceeding rows will be flushed to disk - + // Create overview sheet Sheet sheet = workbook.createSheet("Overview"); if (!scanValues) { @@ -184,7 +184,7 @@ private void generateReport(Map> tableToFieldInfos, Stri Long.valueOf(fieldInfo.rowCount), Long.valueOf(fieldInfo.nProcessed), fieldInfo.getFractionEmpty()); addRow(sheet, ""); } - + // Create per table sheets for (String table : tables) { sheet = workbook.createSheet(table); @@ -223,7 +223,7 @@ private void generateReport(Map> tableToFieldInfos, Stri tableToFieldInfos.remove(table); } } - + try { FileOutputStream out = new FileOutputStream(new File(filename)); workbook.write(out); @@ -233,7 +233,7 @@ private void generateReport(Map> tableToFieldInfos, Stri throw new RuntimeException(e.getMessage()); } } - + private void removeEmptyTables(Map> tableToFieldInfos) { Iterator>> iterator = tableToFieldInfos.entrySet().iterator(); while (iterator.hasNext()) { @@ -241,16 +241,16 @@ private void removeEmptyTables(Map> tableToFieldInfos) { iterator.remove(); } } - + private List processDatabaseTable(String table, RichConnection connection) { StringUtilities.outputWithTime("Scanning table " + table); - + long rowCount = connection.getTableSize(table); if (rowCount == 0) return new ArrayList(); - + List fieldInfos = fetchTableStructure(connection, rowCount, table); - + if (scanValues) { int actualCount = 0; QueryResult queryResult = fetchRowsFromTable(connection, table, rowCount); @@ -267,17 +267,17 @@ private List processDatabaseTable(String table, RichConnection connec for (FieldInfo fieldInfo : fieldInfos) fieldInfo.trim(); } - + return fieldInfos; } - + private QueryResult fetchRowsFromTable(RichConnection connection, String table, long rowCount) { String query; if (dbType == DbType.MSSQL) query = "SELECT * FROM [" + table + "]"; else query = "SELECT * FROM " + table; - + if (sampleSize != -1) { if (dbType == DbType.MSSQL) query += " TABLESAMPLE (" + sampleSize + " ROWS)"; @@ -292,21 +292,25 @@ else if (dbType == DbType.ORACLE) { } else if (dbType == DbType.POSTGRESQL) query += " ORDER BY RANDOM() LIMIT " + sampleSize; } - // System.out.println("SQL: " + query); + //System.out.println("SQL: " + query); return connection.query(query); - + } - + private List fetchTableStructure(RichConnection connection, long rowCount, String table) { - String query; - if (dbType == DbType.ORACLE || dbType == DbType.MSSQL) + String query = null; + if (dbType == DbType.ORACLE) + query = "SELECT COLUMN_NAME,DATA_TYPE FROM ALL_TAB_COLUMNS WHERE table_name = '" + table + "' AND owner = '" + database.toUpperCase() + "'"; + else if (dbType == DbType.MSSQL) query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_CATALOG='" + database + "' AND TABLE_NAME='" + table + "';"; - else - // mysql + else if (dbType == DbType.MYSQL) query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '" + database + "' AND TABLE_NAME = '" + table + "';"; - + else if (dbType == DbType.POSTGRESQL) + query = "SELECT COLUMN_NAME,DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '" + database + "' AND TABLE_NAME = '" + table + "';"; + List fieldInfos = new ArrayList(); for (org.ohdsi.utilities.files.Row row : connection.query(query)) { + row.upperCaseFieldNames(); FieldInfo fieldInfo = new FieldInfo(row.get("COLUMN_NAME")); fieldInfo.type = row.get("DATA_TYPE"); fieldInfo.rowCount = rowCount; @@ -314,7 +318,7 @@ private List fetchTableStructure(RichConnection connection, long rowC } return fieldInfos; } - + private List processCsvFile(String filename) { StringUtilities.outputWithTime("Scanning table " + filename); List fieldInfos = new ArrayList(); @@ -343,10 +347,10 @@ private List processCsvFile(String filename) { } for (FieldInfo fieldInfo : fieldInfos) fieldInfo.trim(); - + return fieldInfos; } - + private class FieldInfo { public String type; public String name; @@ -361,20 +365,20 @@ private class FieldInfo { public boolean isDate = true; public boolean isFreeText = false; public boolean tooManyValues = false; - + public FieldInfo(String name) { this.name = name; } - + public void trim() { if (valueCounts.size() > MAX_VALUES_TO_REPORT) valueCounts.keepTopN(MAX_VALUES_TO_REPORT); } - + public Double getFractionEmpty() { return emptyCount / (double) nProcessed; } - + public String getTypeDescription() { if (type != null) return type; @@ -391,20 +395,20 @@ else if (isReal) else return "varchar"; } - + public void processValue(String value) { String trimValue = value.trim(); nProcessed++; sumLength += value.length(); if (value.length() > maxLength) maxLength = value.length(); - + if (trimValue.length() == 0) emptyCount++; - + if (!isFreeText) { valueCounts.add(value); - + if (trimValue.length() != 0) { if (isReal && !StringUtilities.isNumber(trimValue)) isReal = false; @@ -430,17 +434,17 @@ public void processValue(String value) { for (String word : StringUtilities.mapToWords(trimValue.toLowerCase())) valueCounts.add(word); } - + if (!tooManyValues && valueCounts.size() > MAX_VALUES_IN_MEMORY) { tooManyValues = true; valueCounts.keepTopN(MAX_VALUES_TO_REPORT); } } - + public List> getSortedValuesWithoutSmallValues() { boolean truncated = false; List> result = new ArrayList>(); - + for (Map.Entry entry : valueCounts.key2count.entrySet()) { if (entry.getValue().count < minCellCount) truncated = true; @@ -452,7 +456,7 @@ public List> getSortedValuesWithoutSmallValues() { } } } - + Collections.sort(result, new Comparator>() { public int compare(Pair o1, Pair o2) { return o2.getItem2().compareTo(o1.getItem2()); @@ -463,17 +467,17 @@ public int compare(Pair o1, Pair o2) { return result; } } - + private void addRow(Sheet sheet, Object... values) { Row row = sheet.createRow(sheet.getPhysicalNumberOfRows()); for (Object value : values) { Cell cell = row.createCell(row.getPhysicalNumberOfCells()); - + if (value instanceof Integer || value instanceof Long || value instanceof Double) cell.setCellValue(Double.parseDouble(value.toString())); else cell.setCellValue(value.toString()); - + } } }