
import urllib.request; #载入urllib.request,用于获取页面html源代码
from pandas import Series; #载入series包
from pandas import DataFrame; #载入dataframe包
from bs4 import BeautifulSoup; #载入beautifulsoup包
import json; #载入json包
# Scrape the specification table of a JD.com product page into a two-column
# DataFrame (Feature / Property), look up a price from the JD price endpoint,
# and export the table as CSV.

# Download the product page. Using the response as a context manager makes
# sure the connection is closed even if reading fails.
with urllib.request.urlopen('http://item.jd.com/2957726.html') as response:
    html = response.read()

# Name the parser explicitly so the parse result does not depend on which
# optional parsers happen to be installed (and to avoid bs4's warning about
# a guessed parser).
soup = BeautifulSoup(html, 'html.parser')

# "product-detail-2" is the id of the section that holds the specification
# parameters (found by inspecting the page).
divSoup = soup.find(id="product-detail-2")
if divSoup is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError(
        'element with id "product-detail-2" not found in the product page'
    )
trs = divSoup.find_all('tr')

# Collect the rows first and build the DataFrame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
rows = []
for tr in trs:
    tds = tr.find_all('td')
    # Only rows with exactly two cells are feature/property pairs.
    if len(tds) == 2:
        rows.append([tds[0].getText(), tds[1].getText()])
data = DataFrame(rows, columns=['Feature', 'Property'])

# Query the price endpoint; the body is JSON whose first element carries the
# price under the key 'p'.
# NOTE(review): this skuid (2244423) differs from the product page id
# (2957726) fetched above -- confirm which item is intended.
with urllib.request.urlopen('http://p.3.cn/prices/get?skuid=J_2244423') as response:
    jsonString = response.read()
jsonObject = json.loads(jsonString.decode())
price = jsonObject[0]['p']  # value of 'p', i.e. the price

# Export the scraped table (the original referenced an undefined name `df`).
data.to_csv("D:\\df.csv")