需求:客户的数据同时存在于另一个我们无法控制的系统中,需要与当前系统保持同步。
思路:自动登录另外一个系统,然后抓取数据,同步到本系统中。
技术点:模拟用户登录;保存登录状态;抓取数据
/// <summary>
/// Posts a form to the target URL and returns the response body as text.
/// </summary>
/// <param name="targetURL">Absolute URL that receives the POST request.</param>
/// <param name="cc">Cookie container shared between calls so that cookies (and therefore
/// the session / login state) set by one response are sent with the next request.
/// May be null, in which case no cookies are sent or kept.</param>
/// <param name="param">Form fields to post (field name -> value). Names and values are
/// passed raw; they are URL-encoded here. May be null or empty to post an empty form.</param>
/// <returns>The response body (normally an HTML page), decoded with the system default
/// ANSI encoding.</returns>
public static string PostAndGetHTML(string targetURL, CookieContainer cc, Hashtable param)
{
    // Build the application/x-www-form-urlencoded body. Every name and value is
    // percent-encoded so that '&', '=', '+', spaces and non-ASCII text survive the trip.
    StringBuilder formData = new StringBuilder();
    if (param != null)
    {
        foreach (DictionaryEntry de in param)
        {
            if (formData.Length > 0)
            {
                formData.Append('&');
            }
            string fieldValue = de.Value == null ? "" : de.Value.ToString();
            formData.Append(Uri.EscapeDataString(de.Key.ToString()));
            formData.Append('=');
            formData.Append(Uri.EscapeDataString(fieldValue));
        }
    }

    // The escaped body contains ASCII characters only, so ASCII encoding is lossless here.
    ASCIIEncoding encoding = new ASCIIEncoding();
    byte[] data = encoding.GetBytes(formData.ToString());

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(targetURL);
    request.Method = "POST";
    request.ContentType = "application/x-www-form-urlencoded";
    request.ContentLength = data.Length;
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 2.0.1124)";

    // Attach the cookie container BEFORE the request stream is opened: once the request
    // has been started it is too late to add the stored cookies to the request headers.
    request.CookieContainer = cc;

    using (Stream requestStream = request.GetRequestStream())
    {
        requestStream.Write(data, 0, data.Length);
    }

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        if (cc != null)
        {
            // Keep the cookies returned by this response for the following requests.
            cc.Add(response.Cookies);
        }

        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }
}
40
/// <summary>
/// Fills a DataTable with the cell values found in the HTML of a table.
/// </summary>
/// <remarks>
/// The first &lt;tr&gt; is treated as the header row and skipped. The &lt;td&gt; cells after
/// the second &lt;tr&gt; are read as one flat stream and grouped into rows of
/// dt.Columns.Count cells; later row boundaries are not inspected. For each cell, spaces,
/// tabs and carriage returns are removed and the first non-empty piece of text found
/// outside the tags is stored. A trailing incomplete row is added with its remaining
/// columns left unset.
/// </remarks>
/// <param name="dt">Table whose columns are already defined; rows are appended to it.</param>
/// <param name="tableHTML">HTML text containing the table. May be null or empty.</param>
/// <returns>The same <paramref name="dt"/> instance, with the parsed rows appended.</returns>
/// <exception cref="ArgumentNullException"><paramref name="dt"/> is null.</exception>
public static DataTable ConvertToDT(DataTable dt, string tableHTML)
{
    if (dt == null)
    {
        throw new ArgumentNullException("dt");
    }
    // Nothing to parse, or nowhere to put the values (zero columns would never advance).
    if (string.IsNullOrEmpty(tableHTML) || dt.Columns.Count == 0)
    {
        return dt;
    }

    // Skip the header row: data starts after the second "<tr".
    int headerRow = tableHTML.IndexOf("<tr", StringComparison.OrdinalIgnoreCase);
    if (headerRow < 0)
    {
        return dt;
    }
    int firstDataRow = tableHTML.IndexOf("<tr", headerRow + 3, StringComparison.OrdinalIgnoreCase);
    if (firstDataRow < 0)
    {
        return dt; // header only, no data rows
    }
    int index = firstDataRow + 3; // just after "<tr"

    bool moreCells = true;
    while (moreCells)
    {
        DataRow dr = dt.NewRow();
        int filled = 0;
        while (filled < dt.Columns.Count)
        {
            int tdOpen = tableHTML.IndexOf("<td", index, StringComparison.OrdinalIgnoreCase);
            if (tdOpen < 0)
            {
                moreCells = false; // no further cell: stop instead of wrapping to the start
                break;
            }
            int startTD = tdOpen + 3; // just after "<td"
            int endTD = tableHTML.IndexOf("</td>", startTD, StringComparison.OrdinalIgnoreCase);
            if (endTD < 0)
            {
                moreCells = false; // unterminated cell: stop scanning
                break;
            }

            // tdStr starts inside the opening tag (attributes, then '>'), so after splitting
            // on '<' and '>' the odd-numbered pieces are the text between tags.
            string tdStr = tableHTML.Substring(startTD, endTD - startTD);
            tdStr = tdStr.Replace(" ", "").Replace("\t", "").Replace("\r", "");
            string[] pieces = tdStr.Split('<', '>');
            string value = "";
            for (int j = 1; j < pieces.Length; j += 2)
            {
                string text = pieces[j].Trim();
                if (text != "")
                {
                    value = text;
                    break;
                }
            }

            dr[filled] = value;
            filled++;
            index = endTD;
        }

        if (filled > 0)
        {
            dt.Rows.Add(dr);
        }
    }
    return dt;
}
/// <summary>
/// Posts a form to the target URL and returns the response body as text.
/// </summary>
/// <param name="targetURL">Absolute URL that receives the POST request.</param>
/// <param name="cc">Cookie container shared between calls so that cookies (and therefore
/// the session / login state) set by one response are sent with the next request.
/// May be null, in which case no cookies are sent or kept.</param>
/// <param name="param">Form fields to post (field name -> value). Names and values are
/// passed raw; they are URL-encoded here. May be null or empty to post an empty form.</param>
/// <returns>The response body (normally an HTML page), decoded with the system default
/// ANSI encoding.</returns>
public static string PostAndGetHTML(string targetURL, CookieContainer cc, Hashtable param)
{
    // Build the application/x-www-form-urlencoded body. Every name and value is
    // percent-encoded so that '&', '=', '+', spaces and non-ASCII text survive the trip.
    StringBuilder formData = new StringBuilder();
    if (param != null)
    {
        foreach (DictionaryEntry de in param)
        {
            if (formData.Length > 0)
            {
                formData.Append('&');
            }
            string fieldValue = de.Value == null ? "" : de.Value.ToString();
            formData.Append(Uri.EscapeDataString(de.Key.ToString()));
            formData.Append('=');
            formData.Append(Uri.EscapeDataString(fieldValue));
        }
    }

    // The escaped body contains ASCII characters only, so ASCII encoding is lossless here.
    ASCIIEncoding encoding = new ASCIIEncoding();
    byte[] data = encoding.GetBytes(formData.ToString());

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(targetURL);
    request.Method = "POST";
    request.ContentType = "application/x-www-form-urlencoded";
    request.ContentLength = data.Length;
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SV1; .NET CLR 2.0.1124)";

    // Attach the cookie container BEFORE the request stream is opened: once the request
    // has been started it is too late to add the stored cookies to the request headers.
    request.CookieContainer = cc;

    using (Stream requestStream = request.GetRequestStream())
    {
        requestStream.Write(data, 0, data.Length);
    }

    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        if (cc != null)
        {
            // Keep the cookies returned by this response for the following requests.
            cc.Add(response.Cookies);
        }

        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }
}
40
/// <summary>
/// Fills a DataTable with the cell values found in the HTML of a table.
/// </summary>
/// <remarks>
/// The first &lt;tr&gt; is treated as the header row and skipped. The &lt;td&gt; cells after
/// the second &lt;tr&gt; are read as one flat stream and grouped into rows of
/// dt.Columns.Count cells; later row boundaries are not inspected. For each cell, spaces,
/// tabs and carriage returns are removed and the first non-empty piece of text found
/// outside the tags is stored. A trailing incomplete row is added with its remaining
/// columns left unset.
/// </remarks>
/// <param name="dt">Table whose columns are already defined; rows are appended to it.</param>
/// <param name="tableHTML">HTML text containing the table. May be null or empty.</param>
/// <returns>The same <paramref name="dt"/> instance, with the parsed rows appended.</returns>
/// <exception cref="ArgumentNullException"><paramref name="dt"/> is null.</exception>
public static DataTable ConvertToDT(DataTable dt, string tableHTML)
{
    if (dt == null)
    {
        throw new ArgumentNullException("dt");
    }
    // Nothing to parse, or nowhere to put the values (zero columns would never advance).
    if (string.IsNullOrEmpty(tableHTML) || dt.Columns.Count == 0)
    {
        return dt;
    }

    // Skip the header row: data starts after the second "<tr".
    int headerRow = tableHTML.IndexOf("<tr", StringComparison.OrdinalIgnoreCase);
    if (headerRow < 0)
    {
        return dt;
    }
    int firstDataRow = tableHTML.IndexOf("<tr", headerRow + 3, StringComparison.OrdinalIgnoreCase);
    if (firstDataRow < 0)
    {
        return dt; // header only, no data rows
    }
    int index = firstDataRow + 3; // just after "<tr"

    bool moreCells = true;
    while (moreCells)
    {
        DataRow dr = dt.NewRow();
        int filled = 0;
        while (filled < dt.Columns.Count)
        {
            int tdOpen = tableHTML.IndexOf("<td", index, StringComparison.OrdinalIgnoreCase);
            if (tdOpen < 0)
            {
                moreCells = false; // no further cell: stop instead of wrapping to the start
                break;
            }
            int startTD = tdOpen + 3; // just after "<td"
            int endTD = tableHTML.IndexOf("</td>", startTD, StringComparison.OrdinalIgnoreCase);
            if (endTD < 0)
            {
                moreCells = false; // unterminated cell: stop scanning
                break;
            }

            // tdStr starts inside the opening tag (attributes, then '>'), so after splitting
            // on '<' and '>' the odd-numbered pieces are the text between tags.
            string tdStr = tableHTML.Substring(startTD, endTD - startTD);
            tdStr = tdStr.Replace(" ", "").Replace("\t", "").Replace("\r", "");
            string[] pieces = tdStr.Split('<', '>');
            string value = "";
            for (int j = 1; j < pieces.Length; j += 2)
            {
                string text = pieces[j].Trim();
                if (text != "")
                {
                    value = text;
                    break;
                }
            }

            dr[filled] = value;
            filled++;
            index = endTD;
        }

        if (filled > 0)
        {
            dt.Rows.Add(dr);
        }
    }
    return dt;
}
下面是一个调用的例子:先登录,再查询。实际应用中这个流程可能包含很多步骤。
// Usage example: log in first, then query. Both requests share one CookieContainer so
// the session cookie obtained at login is sent with the query.
CookieContainer cc = new CookieContainer(); // keeps the session and cookies between requests
Hashtable param = new Hashtable();          // form fields to post

// Step 1: log in. The field names must match the input names used by the login form;
// look at the source of the login page to find them.
string urlLogin = "http://demo.server/login.asp";
param.Add("User", "xxx");
param.Add("Password", "xxxx");
string result = GrabHelper.PostAndGetHTML(urlLogin, cc, param);
// TODO: inspect result to check whether the login succeeded.

// Step 2: if the login succeeded, go to the target page and post the query values.
string url2 = "http://demo.server/query.asp?id=1"; // adjust to the real query logic
param.Clear();
// Add the query form fields here when needed, e.g. param.Add("SearchAreaId", "JobId");
result = GrabHelper.PostAndGetHTML(url2, cc, param);
// Next: pass the table HTML to ConvertToDT, or process result in some other way.
// Usage example: log in first, then query. Both requests share one CookieContainer so
// the session cookie obtained at login is sent with the query.
CookieContainer cc = new CookieContainer(); // keeps the session and cookies between requests
Hashtable param = new Hashtable();          // form fields to post

// Step 1: log in. The field names must match the input names used by the login form;
// look at the source of the login page to find them.
string urlLogin = "http://demo.server/login.asp";
param.Add("User", "xxx");
param.Add("Password", "xxxx");
string result = GrabHelper.PostAndGetHTML(urlLogin, cc, param);
// TODO: inspect result to check whether the login succeeded.

// Step 2: if the login succeeded, go to the target page and post the query values.
string url2 = "http://demo.server/query.asp?id=1"; // adjust to the real query logic
param.Clear();
// Add the query form fields here when needed, e.g. param.Add("SearchAreaId", "JobId");
result = GrabHelper.PostAndGetHTML(url2, cc, param);
// Next: pass the table HTML to ConvertToDT, or process result in some other way.
注:此方法对登录时需要验证码的系统无效。(如果该系统把验证码存放在cookie中则属例外,这种情况容易破解。)
评论列表: