using System;
using System.Diagnostics;
using System.Drawing;
using System.Collections;
using System.IO;
using System.ComponentModel;
using System.Windows.Forms;
using mshtml;
using SHDocVw;
using System.Text.RegularExpressions;
using System.Data.OleDb;
namespace WebScraper
{
/// <summary>
/// This is a minimalist interface for controlling the financial data scrapers. It provides
/// facilities for selecting and initiating scavenging operations, logic for processing the
/// retrieved data, and limited progress feedback.
/// </summary>
public class WebScraper : System.Windows.Forms.Form
{
/// <summary>
/// This string constant defines the DBMS connection string. Modify the path and/or name as necessary
/// </summary>
const string DBMSConnection = @"Provider=Microsoft.Jet.OLEDB.4.0;Password="""";User ID=Admin;Data Source=CorporateData.mdb;Mode=Share Deny None;Extended Properties="""";Jet OLEDB:System database="""";Jet OLEDB:Registry Path="""";Jet OLEDB:Database Password="""";Jet OLEDB:Engine Type=5;Jet OLEDB:Database Locking Mode=0;Jet OLEDB:Global Partial Bulk Ops=2;Jet OLEDB:Global Bulk Transactions=1;Jet OLEDB:New Database Password="""";Jet OLEDB:Create System Database=False;Jet OLEDB:Encrypt Database=False;Jet OLEDB:Don't Copy Locale on Compact=False;Jet OLEDB:Compact Without Replica Repair=False;Jet OLEDB:SFP=False";
/// <summary>
/// Name of the file containing the list of ticker symbols
/// </summary>
const string TickerSymbolFile = "tickers.txt";
/// <summary>
/// Email address for where bugs (top level exception catches) should be sent. Only
/// applies to release builds.
/// </summary>
const string ProgramSupportEmail = "mark@vibrant3d.com";
/// <summary>
/// Defines the operating mode of the scraper
/// </summary>
private enum ScraperOperatingMode { Stopped,Started,Paused };
/// <summary>
/// Defines exactly what the scanner is doing while it is operating
/// </summary>
private enum ScraperOperatingState { StartStatement,InStatement,EndStatement };
/// <summary>
/// Defines the type of scraped data, which indicates how it will be converted from
/// textual to internal form.
/// </summary>
public enum ScrapeDataType { Dates,Dollars,Numbers};
/// <summary>
/// Defines individual types of financial records.
/// </summary>
public enum FinancialStatements {NoStatement=-1,BalanceSheet=0,CashFlows=1,Income=2,Equity=3,Trading=4,UserExpressions = 5,NumItems=6};
/// <summary>
/// Defines the global operating mode of the scraper to control behavior in the idle dispatch loop
/// </summary>
private ScraperOperatingMode m_scraperOperatingMode = ScraperOperatingMode.Stopped;
/// <summary>
/// Defines the internal process state when the scraper is running
/// </summary>
private ScraperOperatingState m_scraperOperatingState;
/// <summary>
/// Defines the current statement type being scraped
/// </summary>
private FinancialStatements m_scraperActiveStatement;
/// <summary>
/// Index into the ticker symbol list of the record currently being scraped
/// </summary>
private int m_scraperRecordIndex;
/// <summary>
/// Index of the statement type currently being processed by the scraper (0 through 3)
/// </summary>
private int m_scraperStatementIndex;
/// <summary>
/// List of ticker symbols that will drive scraper retrieval operations
/// </summary>
private ArrayList m_symbols;
/// <summary>
/// Number of records that could not be retrieved in the current pass
/// </summary>
private int m_numFailures;
/// <summary>
/// Number of statements the user has selected for scraping
/// </summary>
private int m_numStatementsToScrape;
/// <summary>
/// Reference to browser instance used to handle HTTP download and DOM text extraction
/// </summary>
private InternetExplorer m_browser;
/// <summary>
/// Start of statement scraping cycle
/// </summary>
/// <remarks>
/// The time scavenging started. Note that these values all reset for
/// each class of operation, i.e. if you select CashFlow and Equity,
/// the calculations about elapsed and remaining time are done for
/// each item individually, not for your collective selection. You
/// may elect to modify this.
/// </remarks>
private DateTime m_startScrapeTime;
/// <summary>
/// Start of individual item scraping within a statement
/// </summary>
private DateTime m_startItemScrapeTime;
#region Windows Forms Designer created variables
private System.ComponentModel.IContainer components;
private System.Windows.Forms.GroupBox groupBox1;
private System.Windows.Forms.CheckBox GetBalanceSheet;
private System.Windows.Forms.CheckBox GetCashFlow;
private System.Windows.Forms.CheckBox GetEquityInfo;
private System.Windows.Forms.CheckBox GetIncomeStatement;
private System.Windows.Forms.ProgressBar progressBar1;
private System.Windows.Forms.Label performanceMsg;
private System.Windows.Forms.Label CurrentTickerDisplay;
private System.Windows.Forms.Label CurrentRecordDisplay;
private System.Windows.Forms.Label TotalRecordsDisplay;
private System.Windows.Forms.Label TotalRecoveryDisplay;
private System.Windows.Forms.Label SuccessPercentage;
private System.Windows.Forms.Label SecondsPerRecord;
private System.Windows.Forms.Label ElapsedTimeDisplay;
private System.Windows.Forms.Label RemainingTimeDisplay;
private System.Windows.Forms.Button startButton;
private System.Windows.Forms.Button stopButton;
private System.Windows.Forms.Button exitButton;
private System.Windows.Forms.Label label3;
private System.Windows.Forms.Label label1;
private System.Windows.Forms.Label label2;
private System.Windows.Forms.Label label5;
private System.Windows.Forms.Label label4;
private System.Windows.Forms.Label label6;
private System.Windows.Forms.Label label9;
private System.Windows.Forms.ToolTip toolTip1;
private System.Data.OleDb.OleDbConnection MarketDBMS;
private System.Windows.Forms.Timer timer1;
#endregion
public WebScraper()
{
    // Designer-generated control construction; must run before any control is touched
    InitializeComponent();

    // no statements have been selected yet
    m_numStatementsToScrape = 0;

    // Read the (potentially ordered) ticker symbol list and shuffle it; the
    // shuffled list is what drives all retrievals
    m_symbols = randomizeList(loadTickerSymbols(TickerSymbolFile));

    // The connection string generated by the visual database setup ties the
    // project to the machine it was designed on, so replace it with the
    // portable constant. This keeps the project movable between development systems.
    MarketDBMS.ConnectionString = DBMSConnection;

    // the scraper state machine is stepped from the application idle event
    Application.Idle += new System.EventHandler(doOperatingCycle);
}
private void startButton_Click(object sender, System.EventArgs e)
{
    // The start button is a three-way toggle: Start -> Pause -> Continue -> Pause ...
    if(m_scraperOperatingMode == ScraperOperatingMode.Stopped)
    {
        // start scavenging operations
        MarketDBMS.Open();
        try
        {
            m_browser = new InternetExplorer();
        }
        catch
        {
            // The browser could not be created. Release the connection again so the
            // scraper really is back in the Stopped state; otherwise the next press
            // of Start would fail in Open() because the connection is still open.
            MarketDBMS.Close();
            throw;
        }
        m_scraperOperatingMode = ScraperOperatingMode.Started;
        m_scraperOperatingState = ScraperOperatingState.StartStatement;
        // reset the state index
        m_scraperStatementIndex = 0;
        // update the total # of records display
        TotalRecordsDisplay.Text = m_symbols.Count.ToString();
        forceFieldUpdate(TotalRecordsDisplay);
        this.startButton.Text = "Pause";
        this.stopButton.Enabled = true;
    }
    else if(m_scraperOperatingMode == ScraperOperatingMode.Started)
    {
        // pause the scavenging operations; the idle handler ignores every mode but Started
        this.performanceMsg.Text = "";
        m_scraperOperatingMode = ScraperOperatingMode.Paused;
        this.startButton.Text = "Continue";
    }
    else if(m_scraperOperatingMode == ScraperOperatingMode.Paused)
    {
        // restart the scavenging operation where it left off
        m_scraperOperatingMode = ScraperOperatingMode.Started;
        this.startButton.Text = "Pause";
    }
}
private string m_runDisplayMessage = "STARTING";
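/// <summary>
/// This is the root function that drives the scavenging process. It is hooked to the
/// application idle event and does nothing unless the scraper is in the Started mode
/// (entered by pressing the start button). Each call advances the scraper state machine
/// by one step: selecting the next checked statement type, scraping a single record,
/// or finishing the current statement.
/// </summary>
/// <param name="sender">Source of the idle event (not used)</param>
/// <param name="e">Event arguments (not used)</param>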
private void doOperatingCycle(object sender, System.EventArgs e)
{
    // idle events arrive regardless of scraper mode; only act while running
    if(m_scraperOperatingMode != ScraperOperatingMode.Started)
        return;
    updateMessage(m_runDisplayMessage);
    switch(m_scraperOperatingState)
    {
        case ScraperOperatingState.StartStatement :
            m_runDisplayMessage = "STARTING";
            // clear the recovered record count
            TotalRecoveryDisplay.Text = "0";
            forceFieldUpdate(TotalRecoveryDisplay);
            // reset the recovered records display count and percentage. This has to be
            // done here because it only updates on successful recoveries
            m_startScrapeTime = DateTime.Now;
            // Seed the per-record timer as well. Without this the first record of a
            // statement is timed against a stale (or never assigned) timestamp, which
            // yields a nonsensical seconds-per-record figure and remaining-time estimate.
            m_startItemScrapeTime = m_startScrapeTime;
            SuccessPercentage.Text = "0";
            forceFieldUpdate(SuccessPercentage);
            // clear the internal failure count
            m_numFailures = 0;
            m_scraperRecordIndex = 0;
            // Select the statement for this pass. An unchecked statement just advances
            // the index; the next idle cycle re-enters StartStatement for the following one.
            switch(m_scraperStatementIndex)
            {
                case 0 :
                    if(this.GetBalanceSheet.Checked)
                    {
                        m_scraperActiveStatement = FinancialStatements.BalanceSheet;
                        m_scraperOperatingState = ScraperOperatingState.InStatement;
                        m_runDisplayMessage = "BALANCE SHEET";
                    }
                    else
                    {
                        ++m_scraperStatementIndex;
                    }
                    break;
                case 1 :
                    if(this.GetCashFlow.Checked)
                    {
                        m_scraperActiveStatement = FinancialStatements.CashFlows;
                        m_scraperOperatingState = ScraperOperatingState.InStatement;
                        m_runDisplayMessage = "CASH FLOW";
                    }
                    else
                    {
                        ++m_scraperStatementIndex;
                    }
                    break;
                case 2 :
                    if(this.GetIncomeStatement.Checked)
                    {
                        m_scraperActiveStatement = FinancialStatements.Income;
                        m_scraperOperatingState = ScraperOperatingState.InStatement;
                        m_runDisplayMessage = "INCOME STMNT";
                    }
                    else
                    {
                        ++m_scraperStatementIndex;
                    }
                    break;
                case 3 :
                    if(this.GetEquityInfo.Checked)
                    {
                        m_scraperActiveStatement = FinancialStatements.Equity;
                        m_scraperOperatingState = ScraperOperatingState.InStatement;
                        m_runDisplayMessage = "EQUITY INFO";
                    }
                    else
                    {
                        // last statement type and it is not selected: nothing left to do
                        stopScraper();
                    }
                    break;
            }
            break;
        case ScraperOperatingState.InStatement :
            // one record per idle cycle keeps the UI responsive between retrievals
            if(m_scraperRecordIndex >= m_symbols.Count)
                m_scraperOperatingState = ScraperOperatingState.EndStatement;
            else
                scrapeFinancialInformation(m_browser,m_scraperActiveStatement,m_scraperRecordIndex++);
            break;
        case ScraperOperatingState.EndStatement :
            m_runDisplayMessage = "FINISH STMNT";
            // move on to the next statement type, or stop after the fourth
            if(++m_scraperStatementIndex < 4)
                m_scraperOperatingState = ScraperOperatingState.StartStatement;
            else
                stopScraper();
            break;
    }
}
/// <summary>
/// Top level scavenging method. This function handles scavenging for all generically parsed records. It deals with
/// a single record at a time and is repeatedly dispatched by the operating cycle to process all records. The operating
/// cycle, not this method, switches the operating state to EndStatement once every record has been dispatched
/// </summary>
/// <param name="ie">Browser instance, used to download code and extract text from the DOM</param>
/// <param name="scrapeSource">Specific data type to be scraped</param>
/// <param name="recordIndex">Index into the ticker symbol list of the record to scrape</param>
private void scrapeFinancialInformation(InternetExplorer ie,FinancialStatements scrapeSource,int recordIndex)
{
    // look up the ticker for this record and attempt to retrieve and store its statement
    string ticker = (string) m_symbols[recordIndex];
    bool recovered = processRecord(ie,scrapeSource,ticker);
    // reflect the outcome in the ticker, progress and timing displays
    updateProgressMonitorFields(recordIndex,recovered);
}
/// <summary>
/// This is the generic top level function that handles scavenging individual company records.
/// It first determines the correct URL to retrieve data from, and then directs the browser
/// to this location. If the browser returns sufficient data to indicate the result was
/// successful it then attempts to parse the returned data
/// </summary>
/// <param name="ie">Reference to web browser instance used to handle basic HTTP connection</param>
/// <param name="srt">The type of financial statement being processed</param>
/// <param name="sym">The current symbol being processed</param>
/// <returns>Boolean indicating whether or not a record was successfully retrieved</returns>
private bool processRecord(InternetExplorer ie,FinancialStatements srt,string sym)
{
    // choose the page address for the requested statement type; any other
    // statement type leaves the address empty
    string address;
    if(srt == FinancialStatements.BalanceSheet)
        address = balanceSheetURL(sym);
    else if(srt == FinancialStatements.Income)
        address = incomeStatementURL(sym);
    else if(srt == FinancialStatements.CashFlows)
        address = cashFlowURL(sym);
    else if(srt == FinancialStatements.Equity)
        address = equityStatementURL(sym);
    else
        address = "";

    string pageText = explorerRead(address,ie);

    // a missing page, or one of 1500 characters or fewer, counts as a failed retrieval
    if(pageText == null || pageText.Length <= 1500)
        return false;

    // hand the page text to the parser that matches the statement type
    if(srt == FinancialStatements.BalanceSheet)
        return processBalanceSheetRecord(sym,pageText);
    if(srt == FinancialStatements.Income)
        return processIncomeRecord(sym,pageText);
    if(srt == FinancialStatements.CashFlows)
        return processCashFlowRecord(sym,pageText);
    if(srt == FinancialStatements.Equity)
        return processEquityRecord(sym,pageText);
    return false;
}
/// <summary>
/// Updates the display controls during the scavenging process. This provides
/// the user with information on the progress of their scavenging operation. It
/// updates the progress bar, the various record counts, and the elapsed and
/// remaining time displays
/// </summary>
/// <param name="currentIndex">Current record index</param>
/// <param name="lastResultSuccessful">true if the last record was successfully retrieved</param>
private void updateProgressMonitorFields(int currentIndex,bool lastResultSuccessful)
{
    // currentIndex is zero based, so this many records have been attempted so far
    int attempted = currentIndex + 1;

    // update the ticker field display; the background colour flags success or failure
    CurrentTickerDisplay.Text = (string) m_symbols[currentIndex];
    if(lastResultSuccessful)
        CurrentTickerDisplay.BackColor = Color.LightGreen;
    else
        CurrentTickerDisplay.BackColor = Color.LightPink;
    CurrentTickerDisplay.Invalidate(true);
    CurrentTickerDisplay.Update();

    // update the progress display (the fraction done is scaled to 0..1000)
    float v1 = (float) currentIndex / (float) m_symbols.Count;
    progressBar1.Value = (int) (v1 * 1000.0);
    forceFieldUpdate(progressBar1);

    // update the current record display
    CurrentRecordDisplay.Text = currentIndex.ToString();
    forceFieldUpdate(CurrentRecordDisplay);

    // update the total # successfully recovered and the success percentage
    // if the last record was successfully recovered
    if(lastResultSuccessful)
    {
        // success count
        int recovered = attempted - m_numFailures;
        TotalRecoveryDisplay.Text = recovered.ToString();
        forceFieldUpdate(TotalRecoveryDisplay);
        // Success percentage, over the same base as the success count. The old
        // calculation divided by currentIndex, which produced NaN for the first
        // record and disagreed with the count shown next to it.
        float pctg = (float) recovered / (float) attempted;
        SuccessPercentage.Text = (pctg * 100.0).ToString("0.00");
        forceFieldUpdate(SuccessPercentage);
    }
    else // count the failure
        m_numFailures++;

    // update the per record, elapsed, and remaining times
    DateTime endItemScrapeTime = DateTime.Now;
    TimeSpan recordTime = endItemScrapeTime - m_startItemScrapeTime;
    SecondsPerRecord.Text = (recordTime.TotalMilliseconds / 1000.0).ToString("0.00");
    forceFieldUpdate(SecondsPerRecord);
    TimeSpan elapsedTime = endItemScrapeTime - m_startScrapeTime;
    ElapsedTimeDisplay.Text = timespanString(elapsedTime);
    forceFieldUpdate(ElapsedTimeDisplay);

    // computing the remaining time is a guess - we just use the last record retrieval time
    // and multiply this by the number of remaining records, but the more adventurous could
    // implement some type of averaging function to get a more stable estimate.
    // The record just finished is no longer remaining, hence Count - attempted.
    int remainingRecords = m_symbols.Count - attempted;
    TimeSpan estRemTime = new TimeSpan(recordTime.Ticks * remainingRecords);
    RemainingTimeDisplay.Text = timespanString(estRemTime);
    forceFieldUpdate(RemainingTimeDisplay);

    // the next record is timed from the end of this one
    m_startItemScrapeTime = endItemScrapeTime;
}
/// <summary>
/// Generate the url for retrieving a specific balance sheet
/// </summary>
/// <param name="sym">Symbol of equity to return data for</param>
/// <returns>URL of balance sheet for given equity</returns>
private string balanceSheetURL(string sym)
{
    // pages are grouped into directories named after the symbol's leading character
    string bucket = sym.Substring(0,1);
    return String.Concat("http://biz.yahoo.com/fin/l/",bucket,"/",sym,"_qb.html");
}
/// <summary>
/// Generate the url for retrieving a specific cash flow statement
/// </summary>
/// <param name="sym">Symbol of equity to return data for</param>
/// <returns>URL of cash flow statement for given equity</returns>
private string cashFlowURL(string sym)
{
    // pages are grouped into directories named after the symbol's leading character
    string bucket = sym.Substring(0,1);
    return String.Concat("http://biz.yahoo.com/fin/l/",bucket,"/",sym,"_qc.html");
}
/// <summary>
/// Generate the url for retrieving a specific income statement
/// </summary>
/// <param name="sym">Symbol of equity to return data for</param>
/// <returns>URL of income statement for given equity</returns>
private string incomeStatementURL(string sym)
{
    // pages are grouped into directories named after the symbol's leading character
    string bucket = sym.Substring(0,1);
    return String.Concat("http://biz.yahoo.com/fin/l/",bucket,"/",sym,".html");
}
/// <summary>
/// Generate the url for retrieving a specific equity information page
/// </summary>
/// <param name="sym">Symbol of equity to return data for</param>
/// <returns>URL of equity information page</returns>
private string equityStatementURL(string sym)
{
    // pages are grouped into directories named after the symbol's leading character
    string bucket = sym.Substring(0,1);
    return String.Concat("http://biz.yahoo.com/p/",bucket,"/",sym,".html");
}
/// <summary>
/// Method for reading a specific web page and extracting the text content of the page
/// </summary>
/// <param name="web_page_address">string giving the URL the browser is to navigate to</param>
/// <param name="ie">browser instance</param>
/// <returns>String containing the text contents of the page or null</returns>
private string explorerRead(string web_page_address,InternetExplorer ie)
{
    // placeholder values for the by-reference arguments of Navigate that we do not use
    System.Object nullObject = 0;
    System.Object nullObjStr = "";
    try
    {
        // set a five second delay on the timer
        timer1.Interval = 5000;
        timer1.Start();
        ie.Navigate(web_page_address,ref nullObject, ref nullObjStr, ref nullObjStr, ref nullObjStr);
    }
    catch(Exception)
    {
        // navigation could not even be started: report "no page"
        timer1.Stop();
        return null;
    }
    // Poll the browser until the page has finished loading, pumping messages so the
    // UI stays alive while we wait.
    // NOTE(review): after more than 25 polls (about 2.5 seconds) the timer is stopped,
    // but the loop itself keeps waiting, so a page that never reaches the complete
    // state keeps this loop spinning. Confirm whether a hard timeout was intended.
    int ctr = 0;
    while( ie.ReadyState!=tagREADYSTATE.READYSTATE_COMPLETE )
    {
        Application.DoEvents();
        System.Threading.Thread.Sleep(100);
        if(++ctr > 25)
        {
            timer1.Stop();
        }
    }
    // stop the timer
    timer1.Stop();
    // Extract and return the text contents of the retrieved page. The document is not
    // guaranteed to be an HTML document with a body, so report "no page" (null, as the
    // contract promises) rather than failing with a null reference.
    IHTMLDocument2 doc = ie.Document as IHTMLDocument2;
    if(doc == null)
        return null;
    IHTMLElement pageBody = doc.body;
    if(pageBody == null)
        return null;
    return pageBody.outerText;
}
/// <summary>
/// Setup for generic parsing of balance sheet data and database update with scraped data
/// </summary>
/// <param name="sym">Symbol of equity the page data is for</param>
/// <param name="theBodyIn">The raw page data</param>
/// <returns>Boolean indicating whether parse and dbms update were successful</returns>
private bool processBalanceSheetRecord(string sym, string theBodyIn)
{
    // One directive per balance sheet line item: the key the value is stored under,
    // the pattern that locates the row in the page text, how to convert the matched
    // text, and the column count passed along to the converters.
    ItemParseDirective[] balanceItems = new ItemParseDirective[]
    {
        new ItemParseDirective(FinancialStatements.BalanceSheet,"POSTDATE",@"\s*Period Ending(?<POSTDATE>.+)",ScrapeDataType.Dates,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"CASH",@"\s*Cash And Cash Equivalents(?<CASH>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"STI",@"\s*Short Term Investments(?<STI>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"NR",@"\s*Net Receivables(?<NR>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"I",@"\s*Inventory(?<I>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"OCA",@"\s*Other Current Assets(?<OCA>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"LTI",@"\s*Long Term Investments(?<LTI>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"TA",@"\s*Property Plant And Equipment(?<TA>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"GW",@"\s*Goodwill(?<GW>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"IA",@"\s*Intangible Assets(?<IA>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"AA",@"\s*Accumulated Amortization(?<AA>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"OA",@"\s*Other Assets(?<OA>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"DLTAC",@"\s*Deferred Long Term Asset Charges(?<DLTAC>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"AP",@"\s*Payables And Accrued Expenses(?<AP>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"DPAY",@"\s*Short Term And Current Long Term Debt(?<DPAY>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"OCL",@"\s*Other Current Liabilities(?<OCL>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"LTD",@"\s*Long Term Debt(?<LTD>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"OL",@"\s*Other Liabilities(?<OL>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"DLTLC",@"\s*Deferred Long Term Liability Charges(?<DLTLC>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"MI",@"\s*Minority Interest(?<MI>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"NG",@"\s*Negative Goodwill(?<NG>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"OW",@"\s*Misc Stocks Options Warrants(?<OW>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"RPS",@"\s*Redeemable Preferred Stock(?<RPS>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"PS",@"\s*Preferred Stock(?<PS>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"CS",@"\s*Common Stock(?<CS>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"RE",@"\s*Retained Earnings(?<RE>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"TS",@"\s*Treasury Stock(?<TS>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"CAPS",@"\s*Capital Surplus(?<CAPS>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"OSE",@"\s*Other Stockholder Equity(?<OSE>.+)",ScrapeDataType.Dollars,4)
    };

    int columnCount = 0;
    Hashtable parsedItems = processGenericRecord(theBodyIn,balanceItems,ref columnCount);

    // nothing matched at all: report the record as not recovered and write nothing
    if(parsedItems.Count <= 0)
        return false;

    writeGenericRecord("QuarterlyBalance",sym,parsedItems,columnCount);
    return true;
}
/// <summary>
/// Setup for generic parsing of cash flow statement data and database update with scraped data
/// </summary>
/// <param name="sym">Symbol of equity the page data is for</param>
/// <param name="theBodyIn">The raw page data</param>
/// <returns>Boolean indicating whether parse and dbms update were successful</returns>
private bool processCashFlowRecord(string sym, string theBodyIn)
{
    // One directive per cash flow line item: the key the value is stored under,
    // the pattern that locates the row in the page text, how to convert the matched
    // text, and the column count passed along to the converters.
    // NOTE(review): every directive here is tagged FinancialStatements.BalanceSheet even
    // though this is the cash flow table. This looks like a copy/paste leftover; confirm
    // how ItemParseDirective consumers use the tag before changing it.
    ItemParseDirective[] cashFlowItems = new ItemParseDirective[]
    {
        new ItemParseDirective(FinancialStatements.BalanceSheet,"POSTDATE",@"\s*Period Ending:(?<POSTDATE>.+)",ScrapeDataType.Dates,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"DEPR",@"\s*Depreciation(?<DEPR>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"ADNI",@"\s*Adjustments To Net Income(?<ADNI>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"CAR",@"\s*Changes In Accounts Receivables(?<CAR>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"CL",@"\s*Changes In Liabilities(?<CL>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"CI",@"\s*Changes In Inventories(?<CI>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"COOA",@"\s*Changes In Other Operating Activities(?<COOA>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"CE",@"\s*Capital Expenditures(?<CE>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"INV",@"\s*Investments(?<INV>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"OCFIA",@"\s*Other Cashflows From Investing Activities(?<OCFIA>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"DP",@"\s*Dividends Paid(?<DP>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"SPS",@"\s*Sale Purchase Of Stock(?<SPS>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"NB",@"\s*Net Borrowings(?<NB>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"OCFFA",@"\s*Other Cashflows From Financing Activities(?<OCFFA>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"EER",@"\s*Effect Of Exchange Rate(?<EER>.+)",ScrapeDataType.Dollars,4)
    };

    int columnCount = 0;
    Hashtable parsedItems = processGenericRecord(theBodyIn,cashFlowItems,ref columnCount);

    // nothing matched at all: report the record as not recovered and write nothing
    if(parsedItems.Count <= 0)
        return false;

    writeGenericRecord("QuarterlyCashFlow",sym,parsedItems,columnCount);
    return true;
}
/// <summary>
/// Setup for generic parsing of income statement data and database update with scraped data
/// </summary>
/// <param name="sym">Symbol of equity the page data is for</param>
/// <param name="theBodyIn">The raw page data</param>
/// <returns>Boolean indicating whether parse and dbms update were successful</returns>
private bool processIncomeRecord(string sym, string theBodyIn)
{
    // One directive per income statement line item: the key the value is stored under,
    // the pattern that locates the row in the page text, how to convert the matched
    // text, and the column count passed along to the converters.
    // NOTE(review): every directive here is tagged FinancialStatements.BalanceSheet even
    // though this is the income statement table. This looks like a copy/paste leftover;
    // confirm how ItemParseDirective consumers use the tag before changing it.
    ItemParseDirective[] incomeItems = new ItemParseDirective[]
    {
        new ItemParseDirective(FinancialStatements.BalanceSheet,"POSTDATE",@"\s*Period Ending:(?<POSTDATE>.+)",ScrapeDataType.Dates,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"REVENUES",@"\s*Total Revenue(?<REVENUES>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"COGS",@"\s*Cost Of Revenue(?<COGS>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"RD",@"\s*Research And Development(?<RD>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"SGA",@"\s*Selling General And Administrative Expenses(?<SGA>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"NR",@"\s*Non Recurring(?<NR>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"OX",@"\s*Other Operating Expenses(?<OX>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"OIN",@"\s*Total Other Income And Expenses Net(?<OIN>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"INTEREST",@"\s*Interest Expense(?<INTEREST>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"IT",@"\s*Income Tax Expense(?<IT>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"NIUC",@"\s*Equity Earnings Or Loss Unconsolidated Subsidiary(?<NIUC>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"MINT",@"\s*Minority Interest(?<MINT>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"DO",@"\s*Discontinued Operations(?<DO>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"EI",@"\s*Extraordinary Items(?<EI>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"AC",@"\s*Effect Of Accounting Changes(?<AC>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"OTHER",@"\s*Other Items(?<OTHER>.+)",ScrapeDataType.Dollars,4),
        new ItemParseDirective(FinancialStatements.BalanceSheet,"PSA",@"\s*Preferred Stock And Other Adjustments(?<PSA>.+)",ScrapeDataType.Dollars,4)
    };

    int columnCount = 0;
    Hashtable parsedItems = processGenericRecord(theBodyIn,incomeItems,ref columnCount);

    // nothing matched at all: report the record as not recovered and write nothing
    if(parsedItems.Count <= 0)
        return false;

    writeGenericRecord("QuarterlyIncome",sym,parsedItems,columnCount);
    return true;
}
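/// <summary>
/// This handles processing of records whose elements of interest are defined by item parse directives
/// </summary>
/// <param name="theBodyIn">The raw page text to search</param>
/// <param name="directives">Parse directives giving each item's key, match pattern and data type</param>
/// <param name="columnsRead">Receives the number of dates parsed from the date item (0 if no date item was matched)</param>
/// <returns>Hashtable mapping the key of each matched directive to its array of parsed values</returns>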
private Hashtable processGenericRecord(string theBodyIn,ItemParseDirective[] directives,ref int columnsRead)
{
    // turn every bare carriage return into a CR/LF pair before pattern matching
    string theBody = theBodyIn.Replace("\r","\r\n");
    // shared across all directives: each search is handed the offset left by the previous one
    int searchoffset = 0;
    string[] rslts;
    string[] vars = new String[1];
    Hashtable finalOutput = new Hashtable();
    columnsRead = 0;
    for(int i = 0;i < directives.Length;i++)
    {
        // stamp the directive with its position in the table
        directives[i].m_recordID = i;
        vars[0] = directives[i].m_key;
        rslts = this.grabData(theBody,directives[i].m_pattern,vars,ref searchoffset);
        if(rslts == null || rslts.Length == 0)
            continue; // item not present on this page

        string rawValue = rslts[0].Trim();
        // parse the individual results
        // note that columnsRead is updated ONLY from the parseDateData result. This is because this
        // routine is not prone to error, given the format of dates. This logic does presuppose that
        // we're always reading in date information, however
        switch(directives[i].m_dataType)
        {
            case ScrapeDataType.Dates :
                DateTime[] dates = parseDateData(rawValue,directives[i].m_numColumns);
                // parseDateData returns null when the matched text holds no recognisable
                // date; skip the item instead of dereferencing the null result
                if(dates != null)
                {
                    finalOutput[directives[i].m_key] = dates;
                    columnsRead = dates.Length;
                }
                break;
            case ScrapeDataType.Dollars :
            case ScrapeDataType.Numbers :
                // both numeric flavours go through the same converter
                finalOutput[directives[i].m_key] = parseNumericData(rawValue,directives[i].m_numColumns);
                break;
        }
    }
    return finalOutput;
}
/// <summary>
/// Custom parser for equity information. This is an unrolled hand modified version of the generic parser
/// that shows a recently developed parser for some specialized information. Once the parsing logic works
/// and is stable, this is a candidate for being rewritten as another general parser, i.e. the logic is broken
/// down into support methods and an ItemParseDirectives table
/// </summary>
/// <param name="sym">The key for the record, which will be used when writing it to the database</param>
/// <param name="theBodyIn">The raw text recovered from the web page</param>
/// <returns></returns>
private bool processEquityRecord(string sym,string theBodyIn)
{
string theBody = theBodyIn;
theBody = theBody.Replace("\r","\r\n");
int searchoffset = 0;
string empPtn = @"\s*Employees.+:(?<emp>.+)";
string sctPtn = @"\s*Sector:(?<sct>.+)";
string instPtn = @"\s*Institutional:(?<inst>\s*[0-9]+)%";
string soPtn = @"Shares Outstanding(?<so>.+)";
string lsPtn = @"Last Split(?<ls>.+)";
string ftPtn = @"Float(?<ft>.+)";
string stPtn = @"Shares Short(?<ss>.+)";
string[] rslts;
string empStr,sctStr,instStr,lsStr,soStr,ftStr,ssStr;
string[] vars = new String[1];
vars[0] = "emp";rslts = this.grabData(theBody,empPtn,vars,ref searchoffset);
if(rslts != null)
empStr = rslts[0].Trim();
else
empStr = "1";
vars[0] = "sct";rslts = this.grabData(theBody,sctPtn,vars,ref searchoffset);
if(rslts != null)
sctStr = rslts[0].Trim();
else
sctStr = "Unknown";
vars[0] = "inst";rslts = this.grabData(theBody,instPtn,vars,ref searchoffset);
if(rslts != null)
instStr = rslts[0].Trim();
else
instStr = "0";
vars[0] = "so";rslts = this.grabData(theBody,soPtn,vars,ref searchoffset);
if(rslts != null)
soStr = rslts[0].Trim();
else
soStr = "0";
vars[0] = "ls";rslts = this.grabData(theBody,lsPtn,vars,ref searchoffset);
if(rslts != null)
lsStr = rslts[0].Trim();
else
lsStr = "NONE";
vars[0] = "ft";rslts = this.grabData(theBody,ftPtn,vars,ref searchoffset);
if(rslts != null)
ftStr = rslts[0].Trim();
else
ftStr = "0";
vars[0] = "ss";rslts = this.grabData(theBody,stPtn,vars,ref searchoffset);
if(rslts != null)
ssStr = rslts[0].Trim();
else
ssStr = "0";
// convert the financial values
float numEmpVal = new FinancialValue(empStr).Value;
float instOwnVal = (float) (new FinancialValue(instStr).Value / 100.0);
float sharesOutVal = new FinancialValue(soStr).Value;
float floatVal = (float) new FinancialValue(ftStr).Value;
float shortVal = new FinancialValue(ssStr).Value;
// deal with any reported split
DateTime splitDate = new DateTime(1970,1,1);
float splitFactor = 1;
if(lsStr.ToUpper() != "NONE")
{
int lsso = 0;
string facPtn = @"factor\s(?<fct>.+)\son\s(?<dat>.+)";
vars = new String[2];
vars[0] = "fct";vars[1] = "dat";
rslts = this.grabData(lsStr,facPtn,vars,ref lsso);
string fctStr = rslts[0].Trim();
string datStr = rslts[1].Trim();
splitFactor = new FinancialValue(fctStr).Value;
splitDate = DateTime.Parse(datStr);
}
// update the database
// we know this is a hack for the moment, and we're providing records for
// 1/1/2000,4/1/2000,7/1/2000,10/1/2000 ... 2001 (4) 2002 (4)
DateTime writeDate = DateTime.Now;
DateTime limDate = new DateTime(2003,1,1);
while(writeDate < limDate)
{
DateTime nxtWriteDate = writeDate.AddMonths(3);
float dtoffset = -1;
float sout;
if(writeDate > splitDate)
sout = sharesOutVal;
else
{
sout = sharesOutVal / splitFactor;
if(nxtWriteDate > splitDate)
{
System.TimeSpan ts = splitDate - writeDate;
dtoffset = ts.Days;
}
}
writeCustomEquityRecord(sym,writeDate,dtoffset,numEmpVal,instOwnVal,sout,floatVal,shortVal,sctStr);
writeDate = nxtWriteDate;
}
return true;
}
/// <summary>
/// Find every date written as "Month DD, YYYY" (for example "Dec 31, 2001") in a string and
/// convert each one to its internal DateTime form.
/// </summary>
/// <param name="datastring">The raw string of dates</param>
/// <param name="nCols">The number of dates expected in the string. It is not consulted: the
/// size of the result is determined by how many dates are actually found.</param>
/// <returns>A DateTime array containing the dates in the order they were found, or null when
/// the string is null or contains no recognizable date</returns>
private DateTime[] parseDateData(string datastring,int nCols)
{
    if(datastring == null)
        return null;
    Regex datePattern = new Regex(@"\w+\s+\d+,\s+\d+");
    MatchCollection found = datePattern.Matches(datastring);
    if(found.Count == 0)
        return null;
    DateTime[] dates = new DateTime[found.Count];
    // The scraped pages spell month names in English, so parse with the invariant culture
    // rather than whatever culture the host machine happens to be configured for.
    for(int i = 0;i < found.Count;i++)
        dates[i] = DateTime.Parse(found[i].Value,System.Globalization.CultureInfo.InvariantCulture);
    return dates;
}
/// <summary>
/// Convert a run of numeric values, possibly decorated with financial notation, to doubles.
/// Each value may be written as plain digits with thousands separators ("1,234"), with a
/// decimal point ("12.5"), prefixed by '$', negated with a leading '-' or by being wrapped in
/// parentheses ("(1,234)"), or given as "N/A" (reported as 0.0).
/// </summary>
/// <param name="datastring">The raw data to parse</param>
/// <param name="nCols">The number of distinct items expected</param>
/// <returns>An array of exactly nCols doubles. Columns for which no value could be read
/// (input exhausted, "N/A", or no digits found) are 0.0.</returns>
private double[] parseNumericData(string datastring,int nCols)
{
    double[] results = new double[nCols];
    if(datastring == null)
        return results;
    int offset = 0;
    for(int i = 0;i < nCols;i++)
    {
        // values may be separated by whitespace; skip it so it is never mistaken for data
        while(offset < datastring.Length && Char.IsWhiteSpace(datastring[offset]))
            offset++;
        if(offset >= datastring.Length)
        {
            results[i] = 0.0;
            continue;
        }
        int columnStart = offset;
        // sign state belongs to this column only; it must not leak into the next value
        bool isNegative = false;
        bool usingParens = false;
        char lead = datastring[offset];
        if(lead == '(')
        {
            isNegative = true;
            usingParens = true;
            offset++;
        }
        else if(lead == '-')
        {
            isNegative = true;
            offset++;
        }
        else if(lead == 'N')
        {
            if(offset + 2 < datastring.Length && datastring[offset + 1] == '/' && datastring[offset + 2] == 'A')
            {
                results[i] = 0.0;
                offset += 3;
                continue;
            }
        }
        if(offset < datastring.Length && datastring[offset] == '$')
            offset++;
        // now iterate forward to pick up the number. It may contain commas and one decimal point
        System.Text.StringBuilder digits = new System.Text.StringBuilder();
        int lastcomma = -1;
        bool seenPoint = false;
        bool keepscanning = true;
        while(keepscanning && offset < datastring.Length)
        {
            char ch = datastring[offset];
            if(ch >= '0' && ch <= '9')
            {
                digits.Append(ch);
                offset++;
            }
            else if(ch == ',')
            {
                // more than three digits since the previous comma means two numbers were run
                // together; how to split them is still an open question
                Debug.Assert(lastcomma == -1 || offset - lastcomma <= 4,"two numbers appear to be run together");
                lastcomma = offset++;
            }
            else if(ch == '.' && !seenPoint)
            {
                seenPoint = true;
                digits.Append(ch);
                offset++;
            }
            else if(ch == ')' && usingParens)
            {
                // closing paren belongs to this value
                offset++;
                keepscanning = false;
            }
            else if(ch == '$' || ch == '(' || ch == '-' || ch == 'N' || Char.IsWhiteSpace(ch))
            {
                // start of the next value; leave it in place for the next column
                keepscanning = false;
            }
            else
            {
                // unexpected character: end this value and step over it so the scan always advances
                offset++;
                keepscanning = false;
            }
        }
        // guarantee forward progress even when nothing in this column could be consumed
        if(offset == columnStart)
            offset++;
        double magnitude;
        if(digits.Length > 0 && double.TryParse(digits.ToString(),System.Globalization.NumberStyles.AllowDecimalPoint,System.Globalization.CultureInfo.InvariantCulture,out magnitude))
            results[i] = isNegative ? -magnitude : magnitude;
        else
            results[i] = 0.0;
    }
    return results;
}
/// <summary>
/// This is the generic mechanism used by all parsing components, generic and custom, to extract
/// items from the source text being processed. It applies a regular expression to the text from
/// a given position onward and returns the contents of the requested named groups.
/// </summary>
/// <param name="bulkText">The text to be scanned for the pattern</param>
/// <param name="keyString">The regular expression used to define the pattern</param>
/// <param name="vars">The names of the capture groups whose contents are wanted</param>
/// <param name="offset">The location to start scanning, passed as a reference and updated to the
/// position just past the match on success. It is left untouched when nothing matches.</param>
/// <returns>A new array holding one captured value per entry of 'vars', in the same order, or
/// null when the text is null, the offset lies outside the text, or the pattern does not match</returns>
private string[] grabData(string bulkText,string keyString,string[] vars,ref int offset)
{
    if(bulkText == null || offset < 0 || offset > bulkText.Length)
        return null;
    Regex pattern = new Regex(keyString);
    // search only the unscanned tail; Match.Index is still relative to the whole of bulkText
    Match theMatch = pattern.Match(bulkText,offset,bulkText.Length - offset);
    if(!theMatch.Success)
        return null;
    string[] rslt = new String[vars.Length];
    for(int i = 0;i < vars.Length;i++)
        rslt[i] = theMatch.Result("${" + vars[i] + "}");
    offset = theMatch.Index + theMatch.Length;
    return rslt;
}
/// <summary>
/// Write records to the database based on the results of a generic parsing operation. One
/// INSERT is issued per information column; every item in the bucket supplies one field of
/// the row, and the ticker symbol is always written to the SYMBOL field.
/// </summary>
/// <param name="theTable">Name of the table to write the data to</param>
/// <param name="sym">The ticker symbol used as a unique key for the record</param>
/// <param name="databucket">The set of item and value pairs. Keys are field names. A value is
/// either a string (written unchanged to every row), a DateTime[] under the key "POSTDATE",
/// or a double[]; arrays are indexed by the information column being written.</param>
/// <param name="columnsRead">The number of information columns, i.e. the number of rows to insert</param>
private void writeGenericRecord(string theTable,string sym,Hashtable databucket,int columnsRead)
{
    // OleDbException.ErrorCode value that this routine treats as "duplicate record".
    // NOTE(review): this is the generic failure HRESULT 0x80004005, so other provider
    // failures may be silenced as well - confirm against the provider's behaviour.
    const int DuplicateRecordError = -2147467259;
    // with no items the generated statement would be malformed, so there is nothing to write
    if(databucket == null || databucket.Count == 0)
        return;
    // snapshot the keys once so the field list and the parameter list share one ordering
    string[] kz = new String[databucket.Count];
    int ki = 0;
    foreach(DictionaryEntry entry in databucket)
        kz[ki++] = (string) entry.Key;
    // generate the parameterized SQL INSERT statement. The table and field names cannot be
    // parameters; the symbol can, which keeps quotes in a symbol from breaking the statement
    System.Text.StringBuilder columnList = new System.Text.StringBuilder("SYMBOL");
    System.Text.StringBuilder valueList = new System.Text.StringBuilder("@sym");
    for(int i = 0;i < kz.Length;i++)
    {
        columnList.Append(',').Append(kz[i]);
        valueList.Append(",@").Append(kz[i].ToLower());
    }
    string myInsertQuery = "INSERT INTO " + theTable + " (" + columnList.ToString() + ") Values(" + valueList.ToString() + ")";
    // set parameter values and perform database update for each information column provided
    for(int c = 0;c < columnsRead;c++)
    {
        using(OleDbCommand myCommand = new OleDbCommand(myInsertQuery))
        {
            // parameters are added in the same order as their placeholders appear
            myCommand.Parameters.Add("@sym",sym == null ? "" : sym);
            for(int i = 0;i < kz.Length;i++)
            {
                string ky = "@" + kz[i].ToLower();
                object item = databucket[kz[i]];
                if(item is string)
                {
                    myCommand.Parameters.Add(ky,(string) item);
                }
                else if(kz[i].Equals("POSTDATE"))
                {
                    DateTime theDate = ((DateTime[]) item)[c];
                    Debug.Assert(theDate > new DateTime(1970,1,1));
                    myCommand.Parameters.Add(ky,theDate.ToShortDateString());
                }
                else
                {
                    double thefloat = ((double[]) item)[c];
                    myCommand.Parameters.Add(ky,thefloat.ToString());
                }
            }
            try
            {
                myCommand.Connection = MarketDBMS;
                myCommand.ExecuteNonQuery();
            }
            catch(OleDbException e)
            {
                // silently ignore duplicate record errors
                if(e.ErrorCode != DuplicateRecordError)
                {
                    string msg = e.ToString();
                    if(MessageBox.Show (msg, "Database Error",MessageBoxButtons.OKCancel, MessageBoxIcon.Asterisk) == DialogResult.Cancel)
                        Application.Exit();
                }
            }
            catch(Exception e)
            {
                string msg = e.ToString();
                if(MessageBox.Show (msg, "Generic Error",MessageBoxButtons.OKCancel, MessageBoxIcon.Asterisk) == DialogResult.Cancel)
                    Application.Exit();
            }
        }
    }
}
/// <summary>
/// Write a custom record to the QuarterlyEquity table containing extracted equity information.
/// Over time, this logic can be rewritten to use the generic record writing method
/// (writeGenericRecord).
/// </summary>
/// <param name="sym">The unique ticker symbol to use as a part of the record key</param>
/// <param name="forDate">The date to use as the other part of the record key</param>
/// <param name="chgDateOffset">A floating point representation of the date change</param>
/// <param name="empcount">Number of employees</param>
/// <param name="instown">Institutional ownership percentage</param>
/// <param name="sout">Shares outstanding</param>
/// <param name="thefloat">Share Float</param>
/// <param name="shorts">Number of short sales</param>
/// <param name="sector">Sector, not currently written</param>
private void writeCustomEquityRecord(string sym,DateTime forDate,float chgDateOffset,float empcount,float instown,float sout,float thefloat,float shorts,string sector)
{
    // OleDbException.ErrorCode value that this routine treats as "duplicate record"
    const int DuplicateRecordError = -2147467259;
    string myInsertQuery =
        "INSERT INTO QuarterlyEquity (SYMBOL,POSTDATE,CHGDATE,SHARESOUT,FLOATOUT,SHORTED,EMPLOYEES,INSTOWN) Values(@sym,@pdt,@cdt,@sot,@flt,@srt,@emp,@ion)";
    using(OleDbCommand myCommand = new OleDbCommand(myInsertQuery))
    {
        // parameters are added in the same order as their placeholders appear in the statement
        myCommand.Parameters.Add("@sym",sym);
        myCommand.Parameters.Add("@pdt",forDate.ToShortDateString());
        myCommand.Parameters.Add("@cdt",chgDateOffset.ToString());
        myCommand.Parameters.Add("@sot",sout.ToString());
        myCommand.Parameters.Add("@flt",thefloat.ToString());
        myCommand.Parameters.Add("@srt",shorts.ToString());
        myCommand.Parameters.Add("@emp",empcount.ToString());
        myCommand.Parameters.Add("@ion",instown.ToString());
        myCommand.Connection = MarketDBMS;
        try
        {
            myCommand.ExecuteNonQuery();
        }
        catch(OleDbException e)
        {
            // silently ignore duplicate record errors
            if(e.ErrorCode == DuplicateRecordError)
                return;
            string msg = e.ToString();
            if(MessageBox.Show (msg, "Database Error",MessageBoxButtons.OKCancel, MessageBoxIcon.Asterisk) == DialogResult.Cancel)
                Application.Exit();
        }
        catch(Exception e)
        {
            string msg = e.ToString();
            if(MessageBox.Show (msg, "Generic Error",MessageBoxButtons.OKCancel, MessageBoxIcon.Asterisk) == DialogResult.Cancel)
                Application.Exit();
        }
    }
}
/// <summary>
/// User interface support method used to express a timespan in a string representing hours,
/// minutes, and seconds. Whole days are folded into the hour count, so a span of one day and
/// two hours reads "26:..", and a negative span is shown as its magnitude with a single
/// leading minus sign.
/// </summary>
/// <param name="theSpan">Timespan to convert to a string</param>
/// <returns>String in HH:MM:SS form; the hour field grows beyond two digits when needed</returns>
private string timespanString(TimeSpan theSpan)
{
    bool negative = theSpan < TimeSpan.Zero;
    TimeSpan magnitude = theSpan.Duration();
    // TimeSpan.Hours is only the 0-23 component, so add the whole days back in
    int hrs = magnitude.Days * 24 + magnitude.Hours;
    string text = hrs.ToString("00") + ":" + magnitude.Minutes.ToString("00") + ":" + magnitude.Seconds.ToString("00");
    return negative ? "-" + text : text;
}
/// <summary>
/// Make a control show its current contents immediately. The control is first marked as
/// needing a repaint and is then asked to carry that repaint out straight away, rather than
/// whenever its paint message would otherwise be processed.
/// </summary>
/// <param name="c">The control that is to be updated</param>
private void forceFieldUpdate(Control c)
{
    // step 1: flag the control as stale
    c.Invalidate();
    // step 2: repaint it now
    c.Update();
}
/// <summary>
/// Position counter for the status message; advanced on every updateMessage call and wrapped
/// back to zero once the offset plus the message length reaches the display width.
/// </summary>
private int m_statusScrollOffset = 0;
/// <summary>
/// Show an informational message in the performance message label, padded according to the
/// current scroll offset, and advance that offset for the next call.
/// </summary>
/// <param name="msgtext">The message to display; null is treated as an empty message</param>
private void updateMessage(string msgtext)
{
    // width, in characters, that the message is laid out against
    const int displayWidth = 60;
    if(msgtext == null)
        msgtext = "";
    int padlen = displayWidth - msgtext.Length;
    // The offset may have been advanced while a shorter message was showing, and the message
    // may be wider than the display; either makes the width negative, which PadLeft rejects.
    int leftWidth = Math.Max(0,padlen - m_statusScrollOffset);
    string fullmsg = msgtext.PadLeft(leftWidth,' ').PadRight(m_statusScrollOffset);
    performanceMsg.Text = fullmsg;
    if(++m_statusScrollOffset + msgtext.Length >= displayWidth)
        m_statusScrollOffset = 0;
}
/// <summary>
/// Release the resources held by the form. When invoked with disposing set, the component
/// container is disposed first (if one exists); the base class disposal always runs last.
/// </summary>
/// <param name="disposing">Passed through unchanged to the base class; the component
/// container is only disposed when this is true</param>
protected override void Dispose( bool disposing )
{
    bool releaseComponents = disposing && components != null;
    if(releaseComponents)
        components.Dispose();
    base.Dispose( disposing );
}
#region Windows Form Designer generated code
/// <summary>
/// Required method for Designer support - do not modify
/// the contents of this method with the code editor.
/// It creates every control and component used by the form, sets their position, size, text
/// and tool tip, wires the button, check box and timer event handlers, and finally adds the
/// controls to the group box and to the form itself.
/// </summary>
private void InitializeComponent()
{
// create the component container first; the timer and the tool tip are registered with it
this.components = new System.ComponentModel.Container();
this.CurrentTickerDisplay = new System.Windows.Forms.Label();
this.progressBar1 = new System.Windows.Forms.ProgressBar();
this.performanceMsg = new System.Windows.Forms.Label();
this.startButton = new System.Windows.Forms.Button();
// the connection object is only created here; no connection string is assigned in this method
this.MarketDBMS = new System.Data.OleDb.OleDbConnection();
this.timer1 = new System.Windows.Forms.Timer(this.components);
this.groupBox1 = new System.Windows.Forms.GroupBox();
this.GetBalanceSheet = new System.Windows.Forms.CheckBox();
this.GetCashFlow = new System.Windows.Forms.CheckBox();
this.GetEquityInfo = new System.Windows.Forms.CheckBox();
this.GetIncomeStatement = new System.Windows.Forms.CheckBox();
this.toolTip1 = new System.Windows.Forms.ToolTip(this.components);
this.CurrentRecordDisplay = new System.Windows.Forms.Label();
this.TotalRecordsDisplay = new System.Windows.Forms.Label();
this.TotalRecoveryDisplay = new System.Windows.Forms.Label();
this.SuccessPercentage = new System.Windows.Forms.Label();
this.SecondsPerRecord = new System.Windows.Forms.Label();
this.ElapsedTimeDisplay = new System.Windows.Forms.Label();
this.RemainingTimeDisplay = new System.Windows.Forms.Label();
this.stopButton = new System.Windows.Forms.Button();
this.exitButton = new System.Windows.Forms.Button();
this.label2 = new System.Windows.Forms.Label();
this.label1 = new System.Windows.Forms.Label();
this.label3 = new System.Windows.Forms.Label();
this.label5 = new System.Windows.Forms.Label();
this.label4 = new System.Windows.Forms.Label();
this.label6 = new System.Windows.Forms.Label();
this.label9 = new System.Windows.Forms.Label();
// hold off layout until every control has been configured (resumed at the end of the method)
this.groupBox1.SuspendLayout();
this.SuspendLayout();
//
// CurrentTickerDisplay
//
this.CurrentTickerDisplay.Location = new System.Drawing.Point(8, 200);
this.CurrentTickerDisplay.Name = "CurrentTickerDisplay";
this.CurrentTickerDisplay.Size = new System.Drawing.Size(48, 23);
this.CurrentTickerDisplay.TabIndex = 0;
this.CurrentTickerDisplay.Text = "TICKER";
this.CurrentTickerDisplay.TextAlign = System.Drawing.ContentAlignment.MiddleLeft;
this.toolTip1.SetToolTip(this.CurrentTickerDisplay, "Current record ticker symbol");
//
// progressBar1
//
this.progressBar1.Location = new System.Drawing.Point(56, 200);
this.progressBar1.Maximum = 1000;
this.progressBar1.Name = "progressBar1";
this.progressBar1.Size = new System.Drawing.Size(232, 23);
this.progressBar1.TabIndex = 1;
this.toolTip1.SetToolTip(this.progressBar1, "Record download progress display");
//
// performanceMsg
//
this.performanceMsg.Location = new System.Drawing.Point(16, 240);
this.performanceMsg.Name = "performanceMsg";
this.performanceMsg.Size = new System.Drawing.Size(272, 18);
this.performanceMsg.TabIndex = 0;
this.toolTip1.SetToolTip(this.performanceMsg, "Informational Messages");
//
// startButton
//
// starts out disabled; GetFinancialStatement_CheckedChanged enables it once a statement is selected
this.startButton.Enabled = false;
this.startButton.Location = new System.Drawing.Point(16, 272);
this.startButton.Name = "startButton";
this.startButton.TabIndex = 3;
this.startButton.Text = "Start";
this.toolTip1.SetToolTip(this.startButton, "Start/Pause Scraping");
this.startButton.Click += new System.EventHandler(this.startButton_Click);
//
// timer1
//
// only the Tick handler is attached here; the timer's interval and enabled state are not set in this method
this.timer1.Tick += new System.EventHandler(this.timer1_Tick);
//
// groupBox1
//
this.groupBox1.Controls.AddRange(new System.Windows.Forms.Control[] {
this.GetBalanceSheet,
this.GetCashFlow,
this.GetEquityInfo,
this.GetIncomeStatement});
this.groupBox1.Location = new System.Drawing.Point(8, 8);
this.groupBox1.Name = "groupBox1";
this.groupBox1.Size = new System.Drawing.Size(272, 88);
this.groupBox1.TabIndex = 5;
this.groupBox1.TabStop = false;
this.groupBox1.Text = "Select Data for Scavenging";
//
// GetBalanceSheet
//
// all four statement check boxes share the same CheckedChanged handler
this.GetBalanceSheet.Location = new System.Drawing.Point(16, 24);
this.GetBalanceSheet.Name = "GetBalanceSheet";
this.GetBalanceSheet.TabIndex = 0;
this.GetBalanceSheet.Text = "Balance Sheet";
this.toolTip1.SetToolTip(this.GetBalanceSheet, "Get Balance Sheet Information");
this.GetBalanceSheet.CheckedChanged += new System.EventHandler(this.GetFinancialStatement_CheckedChanged);
//
// GetCashFlow
//
this.GetCashFlow.Location = new System.Drawing.Point(136, 24);
this.GetCashFlow.Name = "GetCashFlow";
this.GetCashFlow.Size = new System.Drawing.Size(120, 24);
this.GetCashFlow.TabIndex = 0;
this.GetCashFlow.Text = "Cash Flow";
this.toolTip1.SetToolTip(this.GetCashFlow, "Get Cash Flow Statement Information");
this.GetCashFlow.CheckedChanged += new System.EventHandler(this.GetFinancialStatement_CheckedChanged);
//
// GetEquityInfo
//
this.GetEquityInfo.Location = new System.Drawing.Point(16, 56);
this.GetEquityInfo.Name = "GetEquityInfo";
this.GetEquityInfo.TabIndex = 0;
this.GetEquityInfo.Text = "Equity Info";
this.toolTip1.SetToolTip(this.GetEquityInfo, "Get Equity Information");
this.GetEquityInfo.CheckedChanged += new System.EventHandler(this.GetFinancialStatement_CheckedChanged);
//
// GetIncomeStatement
//
this.GetIncomeStatement.Location = new System.Drawing.Point(136, 56);
this.GetIncomeStatement.Name = "GetIncomeStatement";
this.GetIncomeStatement.Size = new System.Drawing.Size(120, 24);
this.GetIncomeStatement.TabIndex = 0;
this.GetIncomeStatement.Text = "Income Statement";
this.toolTip1.SetToolTip(this.GetIncomeStatement, "Get Income Statement Information");
this.GetIncomeStatement.CheckedChanged += new System.EventHandler(this.GetFinancialStatement_CheckedChanged);
//
// CurrentRecordDisplay
//
this.CurrentRecordDisplay.Location = new System.Drawing.Point(16, 104);
this.CurrentRecordDisplay.Name = "CurrentRecordDisplay";
this.CurrentRecordDisplay.Size = new System.Drawing.Size(64, 16);
this.CurrentRecordDisplay.TabIndex = 6;
this.CurrentRecordDisplay.Text = "0";
this.CurrentRecordDisplay.TextAlign = System.Drawing.ContentAlignment.MiddleRight;
this.toolTip1.SetToolTip(this.CurrentRecordDisplay, "Number of records checked");
//
// TotalRecordsDisplay
//
this.TotalRecordsDisplay.Location = new System.Drawing.Point(128, 104);
this.TotalRecordsDisplay.Name = "TotalRecordsDisplay";
this.TotalRecordsDisplay.Size = new System.Drawing.Size(72, 16);
this.TotalRecordsDisplay.TabIndex = 6;
this.TotalRecordsDisplay.Text = "0";
this.TotalRecordsDisplay.TextAlign = System.Drawing.ContentAlignment.MiddleLeft;
this.toolTip1.SetToolTip(this.TotalRecordsDisplay, "Total number of records to check");
//
// TotalRecoveryDisplay
//
this.TotalRecoveryDisplay.Location = new System.Drawing.Point(16, 128);
this.TotalRecoveryDisplay.Name = "TotalRecoveryDisplay";
this.TotalRecoveryDisplay.Size = new System.Drawing.Size(64, 16);
this.TotalRecoveryDisplay.TabIndex = 6;
this.TotalRecoveryDisplay.Text = "0";
this.TotalRecoveryDisplay.TextAlign = System.Drawing.ContentAlignment.MiddleRight;
this.toolTip1.SetToolTip(this.TotalRecoveryDisplay, "Number of records successfully updated");
//
// SuccessPercentage
//
this.SuccessPercentage.Location = new System.Drawing.Point(168, 128);
this.SuccessPercentage.Name = "SuccessPercentage";
this.SuccessPercentage.Size = new System.Drawing.Size(48, 16);
this.SuccessPercentage.TabIndex = 6;
this.SuccessPercentage.Text = "0";
this.SuccessPercentage.TextAlign = System.Drawing.ContentAlignment.MiddleRight;
this.toolTip1.SetToolTip(this.SuccessPercentage, "% Successfully Retrieved");
//
// SecondsPerRecord
//
this.SecondsPerRecord.Location = new System.Drawing.Point(96, 184);
this.SecondsPerRecord.Name = "SecondsPerRecord";
this.SecondsPerRecord.Size = new System.Drawing.Size(32, 16);
this.SecondsPerRecord.TabIndex = 6;
this.SecondsPerRecord.Text = "0.00";
this.toolTip1.SetToolTip(this.SecondsPerRecord, "Time to download a single record");
//
// ElapsedTimeDisplay
//
this.ElapsedTimeDisplay.Location = new System.Drawing.Point(16, 160);
this.ElapsedTimeDisplay.Name = "ElapsedTimeDisplay";
this.ElapsedTimeDisplay.Size = new System.Drawing.Size(72, 16);
this.ElapsedTimeDisplay.TabIndex = 6;
this.ElapsedTimeDisplay.Text = "00.00.00";
this.toolTip1.SetToolTip(this.ElapsedTimeDisplay, "Current Run Time of Scraper");
//
// RemainingTimeDisplay
//
this.RemainingTimeDisplay.Location = new System.Drawing.Point(152, 160);
this.RemainingTimeDisplay.Name = "RemainingTimeDisplay";
this.RemainingTimeDisplay.Size = new System.Drawing.Size(64, 16);
this.RemainingTimeDisplay.TabIndex = 6;
this.RemainingTimeDisplay.Text = "00.00.00";
this.toolTip1.SetToolTip(this.RemainingTimeDisplay, "Estimated Remaining Time for current record");
//
// stopButton
//
// starts out disabled, like the start button
this.stopButton.Enabled = false;
this.stopButton.Location = new System.Drawing.Point(104, 272);
this.stopButton.Name = "stopButton";
this.stopButton.TabIndex = 3;
this.stopButton.Text = "Stop";
this.toolTip1.SetToolTip(this.stopButton, "Stop Scraping and Reset");
this.stopButton.Click += new System.EventHandler(this.stopButton_Click);
//
// exitButton
//
this.exitButton.Location = new System.Drawing.Point(200, 272);
this.exitButton.Name = "exitButton";
this.exitButton.TabIndex = 3;
this.exitButton.Text = "Exit";
this.toolTip1.SetToolTip(this.exitButton, "Exit Scraper");
this.exitButton.Click += new System.EventHandler(this.exitButton_Click);
//
// label2
//
// label1 through label9 are fixed captions placed beside the numeric displays above
this.label2.Location = new System.Drawing.Point(96, 104);
this.label2.Name = "label2";
this.label2.Size = new System.Drawing.Size(24, 16);
this.label2.TabIndex = 6;
this.label2.Text = "of";
//
// label1
//
this.label1.Location = new System.Drawing.Point(208, 104);
this.label1.Name = "label1";
this.label1.Size = new System.Drawing.Size(64, 16);
this.label1.TabIndex = 6;
this.label1.Text = "processed.";
//
// label3
//
this.label3.Location = new System.Drawing.Point(88, 128);
this.label3.Name = "label3";
this.label3.Size = new System.Drawing.Size(64, 16);
this.label3.TabIndex = 6;
this.label3.Text = "recovered";
//
// label5
//
this.label5.Location = new System.Drawing.Point(232, 128);
this.label5.Name = "label5";
this.label5.Size = new System.Drawing.Size(72, 16);
this.label5.TabIndex = 6;
this.label5.Text = "% success";
//
// label4
//
this.label4.Location = new System.Drawing.Point(136, 184);
this.label4.Name = "label4";
this.label4.Size = new System.Drawing.Size(64, 16);
this.label4.TabIndex = 6;
this.label4.Text = "secs/record";
//
// label6
//
this.label6.Location = new System.Drawing.Point(96, 160);
this.label6.Name = "label6";
this.label6.Size = new System.Drawing.Size(48, 16);
this.label6.TabIndex = 6;
this.label6.Text = "elapsed";
//
// label9
//
this.label9.Location = new System.Drawing.Point(232, 160);
this.label9.Name = "label9";
this.label9.Size = new System.Drawing.Size(56, 16);
this.label9.TabIndex = 6;
this.label9.Text = "remaining";
//
// WebScraper
//
this.AutoScaleBaseSize = new System.Drawing.Size(5, 13);
this.ClientSize = new System.Drawing.Size(296, 302);
this.Controls.AddRange(new System.Windows.Forms.Control[] {
this.CurrentRecordDisplay,
this.groupBox1,
this.progressBar1,
this.CurrentTickerDisplay,
this.performanceMsg,
this.startButton,
this.label2,
this.TotalRecordsDisplay,
this.label1,
this.TotalRecoveryDisplay,
this.label3,
this.SuccessPercentage,
this.label5,
this.label4,
this.SecondsPerRecord,
this.label6,
this.ElapsedTimeDisplay,
this.RemainingTimeDisplay,
this.label9,
this.stopButton,
this.exitButton});
this.FormBorderStyle = System.Windows.Forms.FormBorderStyle.Fixed3D;
this.Name = "WebScraper";
this.Text = "WebScraper";
// layout was suspended at the top of the method; resume it now that everything is in place
this.groupBox1.ResumeLayout(false);
this.ResumeLayout(false);
}
#endregion
/// <summary>
/// Tick handler for the page retrieval timer. Each tick is treated as a timeout and is
/// reported by throwing an exception, so that the system does not wait indefinitely for a
/// page return. The logic could be extended to retry a page request a finite number of
/// times before giving up.
/// </summary>
/// <param name="sender">The timer raising the event, ignored</param>
/// <param name="e">Event information, ignored</param>
private void timer1_Tick(object sender, System.EventArgs e)
{
    Exception timeout = new Exception("timed out");
    throw timeout;
}
/// <summary>
/// Shared CheckedChanged handler for the statement selection check boxes. It keeps a running
/// count of how many statements are selected and enables the start button only while that
/// count is above zero.
/// </summary>
/// <param name="sender">The check box whose state changed</param>
/// <param name="e">Control event information, ignored</param>
private void GetFinancialStatement_CheckedChanged(object sender, System.EventArgs e)
{
    bool nowChecked = ((CheckBox) sender).Checked;
    // a box that just became checked adds one selection, one that was cleared removes one
    if(nowChecked)
        m_numStatementsToScrape++;
    else
        m_numStatementsToScrape--;
    this.startButton.Enabled = m_numStatementsToScrape > 0;
}
/// <summary>
/// Read in the set of ticker symbols from a file, one symbol per line. Leading and trailing
/// whitespace is removed from every line and blank lines are skipped.
/// </summary>
/// <param name="theFile">The name of the file containing the ticker symbols</param>
/// <returns>An ArrayList of strings representing the ticker symbols, in file order</returns>
private ArrayList loadTickerSymbols(string theFile)
{
    // initialize the ticker symbol array
    ArrayList syms = new ArrayList();
    // open the file that was asked for, and make sure both the stream and the reader are
    // closed again even if reading fails part way through
    using(FileStream tickerData = File.Open(theFile,FileMode.Open,FileAccess.Read))
    using(StreamReader sr = new StreamReader(tickerData))
    {
        string theLine;
        while((theLine = sr.ReadLine()) != null)
        {
            string s = theLine.Trim();
            // make sure we don't put any blank lines into the array, that would
            // break various future assumptions
            if(s.Length > 0)
                syms.Add(s);
        }
    }
    return syms;
}
/// <summary>
/// Randomize the ordering of the ticker symbols. This addresses the fact that
/// some sites may be suspicious of large numbers of queries coming in from a
/// single IP address in sorted order. They may also be suspicious of large numbers
/// of requests from a single IP in any order, but that's another issue that this
/// code does not deal with.
/// </summary>
/// <param name="sourceList">The items to reorder. The list itself is left unchanged.</param>
/// <returns>A new ArrayList holding the same items as sourceList in a random order</returns>
private ArrayList randomizeList(ArrayList sourceList)
{
    if(sourceList == null)
        throw new ArgumentNullException("sourceList");
    // work on a copy so the caller's list is not consumed
    ArrayList reorderedList = new ArrayList(sourceList);
    // fire up a random number generator
    Random r = new Random();
    // Walk from the back of the list: swap each slot with a randomly chosen slot at or before
    // it. Every item is moved at most once per step and nothing is reallocated.
    for(int last = reorderedList.Count - 1;last > 0;last--)
    {
        int pick = r.Next(last + 1);
        object held = reorderedList[pick];
        reorderedList[pick] = reorderedList[last];
        reorderedList[last] = held;
    }
    return reorderedList;
}
/// <summary>
/// Application entry point: creates the scraper form and runs the message loop on it. The
/// entry thread is marked as a single-threaded apartment, which Windows Forms and the COM
/// components this program imports (SHDocVw, mshtml) expect.
/// </summary>
[STAThread]
static void Main()
{
    // This combined conditional build and exception handler is a technique I saw
    // and like very much. If you're debugging, it just blows up in the debugger, but
    // if you're running a retail version, uncaught/unhandled errors get snagged and
    // something reasonable appears
#if DEBUG
    Application.Run(new WebScraper());
#else
    try
    {
        Application.Run(new WebScraper());
    }
    catch (Exception e)
    {
        MessageBox.Show("Something unexpected has happened. Please email " + ProgramSupportEmail + " with this information. \n\n\n" + e.ToString());
    }
#endif
}
/// <summary>
/// Click handler for the stop button. The work itself lives in stopScraper, because a user
/// click is not the only way in which the scraper may be stopped.
/// </summary>
/// <param name="sender">The button that was clicked, ignored</param>
/// <param name="e">Event information, ignored</param>
private void stopButton_Click(object sender, System.EventArgs e)
{
    this.stopScraper();
}
/// <summary>
/// Stop scraping and reset the user interface: close the database connection, clear the
/// status message, record the stopped operating mode, disable the stop button and relabel
/// the start button, which is enabled only if at least one statement check box is ticked.
/// </summary>
private void stopScraper()
{
    MarketDBMS.Close();
    performanceMsg.Text = "";
    m_scraperOperatingMode = ScraperOperatingMode.Stopped;
    stopButton.Enabled = false;
    startButton.Text = "Start";
    // same left-to-right, short-circuit evaluation order as before
    bool anyStatementSelected =
        GetBalanceSheet.Checked ||
        GetCashFlow.Checked ||
        GetEquityInfo.Checked ||
        GetIncomeStatement.Checked;
    startButton.Enabled = anyStatementSelected;
}
/// <summary>
/// Click handler for the exit button; asks the application to shut down.
/// </summary>
/// <param name="sender">The button that was clicked, ignored</param>
/// <param name="e">Event information, ignored</param>
private void exitButton_Click(object sender, System.EventArgs e)
{
    System.Windows.Forms.Application.Exit();
}
}
/// <summary>
/// Data structure that carries information necessary to parse an individual datum recovered
/// from a scraped web page
/// </summary>
public class ItemParseDirective
{
    /// <summary>
    /// Create a directive describing how one item is to be located and interpreted.
    /// </summary>
    /// <param name="rt">The financial statement the item belongs to</param>
    /// <param name="k">The variable key for the item</param>
    /// <param name="p">The parse string (pattern) used to locate the item</param>
    /// <param name="dt">The type of information the item holds</param>
    /// <param name="nc">How many columns of data the item has</param>
    public ItemParseDirective(WebScraper.FinancialStatements rt,string k,string p,WebScraper.ScrapeDataType dt,int nc)
    {
        m_recordType = rt;
        m_key = k;
        m_pattern = p;
        m_dataType = dt;
        m_numColumns = nc;
    }
    // statement
    public WebScraper.FinancialStatements m_recordType;
    // column; not assigned by the constructor, so it keeps its default of 0 until set elsewhere
    public int m_recordID;
    // variable key
    public string m_key;
    // parse string
    public string m_pattern;
    // information type
    public WebScraper.ScrapeDataType m_dataType;
    // how many columns of data
    public int m_numColumns;
}
}